Set up

source("../functions.R")
tol12qualitative=c("#332288", "#6699CC", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#661100", "#CC6677", "#AA4466", "#882255", "#AA4499")
if (!file.exists("output")) {
  system("mkdir output")
}
parallel = TRUE
ncores = 10

Load packages

if (!require(DCARS)) {
  library(devtools)
  devtools::install_github("shazanfar/DCARS")
}
library(DCARS)
library(ggplot2)
library(gplots)
library(SingleCellExperiment)
library(scater)
library(scran)
library(reshape)
library(scattermore)
library(dynamicTreeCut)
library(ComplexHeatmap)
library(GO.db)
library(org.Mm.eg.db)
library(stringr)
library(matrixStats)
library(ggforce)
library(patchwork)
library(ggpubr)
library(cowplot)
library(parallel)
library(monocle)
library(ggthemes)
library(igraph)
library(gtools)
library(plotly)
library(ggrepel)
library(ggridges)
library(M3Drop)
library(scMerge)
library(corrplot)
library(circlize)
library(GGally)
library(corrplot)
library(UpSetR)
library(minerva)
library(energy)

Process Developmental liver data from URL

if (!file.exists("output/liver_pseudotime.Rds")) {
  
  # data webpage https://sydneybiox.github.io/scMerge/articles/Mouse_Liver_Data/Mouse_Liver_Data.html
  # data file URL http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scMergeData/liver_scMerge.rds
  # might take a minute to download
  liver_scMerge = readRDS(url("http://www.maths.usyd.edu.au/u/yingxinl/wwwnb/scMergeData/liver_scMerge.rds"))
  
  dim(assay(liver_scMerge,"scMerge"))
  assay(liver_scMerge,"scMerge")[1:5,1:5]
  table(liver_scMerge$cellTypes, liver_scMerge$batch)
  
  hepa_cellTypes = c("hepatoblast/hepatocyte","cholangiocyte")
  liver_scMerge$isHepa = ifelse(liver_scMerge$cellTypes %in% hepa_cellTypes, "Hepa", "Other")
  
  liver_scMerge <- runPCA(liver_scMerge, exprs_values = "scMerge", ncomponents = 50)
  set.seed(484)
  liver_scMerge <- runTSNE(liver_scMerge, use_dimred = "PCA")
  tsne_hepa = reducedDim(liver_scMerge, "TSNE")[liver_scMerge$isHepa == "Hepa",]
  bounding_df = data.frame(
    x = 1.1*range(tsne_hepa[,1])[c(1,2,2,1,1)],
    y = 1.1*range(tsne_hepa[,2])[c(1,1,2,2,1)]
  )
  tsne_df = as.data.frame(cbind(
    colData(liver_scMerge),
    reducedDim(liver_scMerge, "TSNE")[,1:2]
  ))
  
  g = ggplot(tsne_df, aes(x = V1, y = V2, colour = cellTypes)) +
    geom_point(size = 2.5) +
    theme_classic() +
    theme(legend.position = "bottom") +
    xlab("TSNE 1") +
    ylab("TSNE 2") +
    theme(axis.title = element_text(size = 30)) +
    theme(axis.ticks = element_blank()) +
    theme(axis.text = element_blank()) +
    theme(legend.text = element_text(size = 10)) +
    scale_colour_gdocs() +
    geom_path(data = bounding_df, aes(x = x, y = y), 
              linetype = "dashed", colour = "red",
              size = 1) +
    guides(colour=guide_legend(nrow=3,byrow=TRUE)) +
    NULL
  g
  ggsave(g, file = "output/UMAP_liver_scMerge.pdf", height = 7.5, width = 7)
  
  liver_scMerge$Estage = regmatches(colnames(liver_scMerge), regexpr("E[0-9][0-9].[0-9]|E[0-9].[0-9]", colnames(liver_scMerge)))
  
  table(liver_scMerge$Estage, liver_scMerge$isHepa, liver_scMerge$batch)
  
  tfac = list(liver_scMerge$Estage, liver_scMerge$batch)
  liver_scMerge$width = unsplit(tapply(rep(1,ncol(liver_scMerge)), tfac, sum), tfac)
  liver_scMerge$width_transformed = sqrt(liver_scMerge$width)
  liver_scMerge$isHepaSwitch = factor(liver_scMerge$isHepa, levels = c("Other","Hepa"))
  liver_scMerge$Estage = factor(liver_scMerge$Estage, levels = gtools::mixedsort(unique(liver_scMerge$Estage)))
  
  liver_scMerge2 = liver_scMerge[,liver_scMerge$batch %in% subset(colData(liver_scMerge), isHepa == "Hepa")$batch]
  
  g = ggplot(as.data.frame(colData(liver_scMerge2)),
             aes(y = 1, fill = isHepaSwitch)) +
    geom_bar(aes(width = width_transformed, x = 0 + width_transformed/2), 
             stat = "identity", position = "fill") +
    coord_polar("y", start = 0) +
    facet_grid(Estage ~ batch, switch = "y") +
    theme_minimal() +
    theme(strip.text.y = element_text(angle = 180)) +
    theme(strip.text.x = element_text(angle = 90)) +
    theme(panel.grid = element_blank()) +
    theme(axis.text = element_blank()) +
    theme(strip.placement = "inside") +
    theme(legend.position = "bottom") +
    theme(legend.direction = "vertical") +
    guides(fill = guide_legend(reverse = TRUE)) +
    labs(fill = "") +
    scale_fill_manual(values = c("Hepa" = "red", "Other" = "black"),
                      labels = c("Hepa" = "Hepatic cells", "Other" = "Other cells")) +
    theme(strip.text.y = element_text(size = 13)) +
    theme(strip.text.x = element_text(size = 15)) +
    theme(legend.text = element_text(size = 20)) +
    xlab("") +
    ylab("") +
    NULL
  g
  ggsave(g, file = "output/hepatic_cells_piechart.pdf", height = 8, width = 6)
  
  table(liver_scMerge$cellTypes,liver_scMerge$batch)
  
  ggplot(melt(table(liver_scMerge$cellTypes,liver_scMerge$batch)),
         aes(x = Var.2, group = Var.1, y = value, fill = Var.1)) + geom_bar(stat = "identity") +
    theme_minimal() +
    scale_fill_brewer(palette = "Spectral")
  
  ggplot(melt(table(liver_scMerge$cellTypes,liver_scMerge$batch)),
         aes(x = Var.1, group = Var.2, y = value, fill = Var.2)) + geom_bar(stat = "identity") +
    theme_minimal() +
    scale_fill_brewer(palette = "RdBu")
  
  hepa_scMerge = liver_scMerge[,liver_scMerge$isHepa == "Hepa"]
  
  var.fit <- trendVar(hepa_scMerge, assay.type = "scMerge", parametric=TRUE, loess.args=list(span=0.3), use.spikes = FALSE)
  var.out <- decomposeVar(hepa_scMerge, var.fit, assay.type = "scMerge")
  
  pdf(file = "output/HVG_selection.pdf", height = 8, width = 8)
  plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", 
       ylab="Variance of log-expression")
  curve(var.fit$trend(x), col="dodgerblue", lwd=2, add=TRUE)
  
  seqvals = seq(min(var.out$mean), max(var.out$mean), length.out = 1000)
  peakExp = seqvals[which.max(var.fit$trend(seqvals))]
  
  hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$mean > peakExp),]
  nrow(hvg.out)
  hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),] 
  points(var.out$mean[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], 
         var.out$total[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], col="red", pch=16)
  abline(v = peakExp, lty = 2, col = "black")
  dev.off()
  
  head(hvg.out)
  
  HVG = sort(rownames(hvg.out))
  
  length(HVG)
  
  geneNames <- data.frame(gene_short_name=rownames(hepa_scMerge))
  rownames(geneNames) <- rownames(hepa_scMerge)
  
  fd <- new("AnnotatedDataFrame", data = geneNames)
  pd <- data.frame(cellType = hepa_scMerge$cellTypes,batch = hepa_scMerge$batch)
  rownames(pd) <- colnames(hepa_scMerge)
  pd <- new("AnnotatedDataFrame", data = pd)
  
  exprMat <- assay(hepa_scMerge, "scMerge")
  liver <- newCellDataSet(exprMat, expressionFamily = uninormal(), phenoData = pd, featureData=fd)
  liver <- estimateSizeFactors(liver)
  
  liver <- setOrderingFilter(liver,HVG)
  liver <- reduceDimension(liver, max_components=2, method = "DDRTree",norm_method ="none")
  liver <- orderCells(liver)
  table(pData(liver)$cellType,pData(liver)$State)
  
  # Ahsg gene is meant to be lowly expressed in hepatoblasts and then highly expressed in hepatocytes
  # thus we can tell that the lower branch is the root state
  root_state = levels(pData(liver)$State)[which.min(tapply(exprMat["Ahsg",],pData(liver)$State, mean))]
  chol_state = levels(pData(liver)$State)[which.max(tapply(exprMat["Epcam",],pData(liver)$State, mean))]
  hep_state = setdiff(levels(pData(liver)$State), c(root_state, chol_state))
  
  states = c("Hepatoblast","Cholangiocyte","Hepatocyte")
  names(states) = c(root_state, chol_state, hep_state)
  
  g1 = plot_cell_trajectory(liver, color_by="State", cell_size = 3, x = 2, y = 1) +
    scale_y_reverse() +
    theme(axis.ticks = element_blank()) +
    theme(axis.text = element_blank()) +
    theme(axis.title = element_text(size = 20)) +
    theme(legend.position = "bottom") +
    scale_color_tableau(labels = states) +
    theme(legend.text = element_text(size = 17)) +
    labs(color = "") +
    guides(color = guide_legend(reverse = TRUE)) +
    NULL
  g1
  ggsave(g1, file = "output/hepa_monocle.pdf",height = 6, width = 6)
  
  # add bounding boxes to the monocle graphs
  liver$nameStates = states[as.character(liver$State)]
  dimred_liver = t(liver@reducedDimS)
  expandRange = function(range, diff = 0.5){
    return(c(range[1]-diff, range[2] + diff))
  }
  bounding_df_liver_first = data.frame(
    x = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast"),1]))[c(1,2,2,1,1)],
    y = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast"),2]), diff = 0.5)[c(1,1,2,2,1)]
  )
  g11 = g1 + geom_path(data = bounding_df_liver_first, aes(x = y, y = x), 
                       linetype = "dashed", colour = "red",
                       size = 1) +
    NULL
  g11  
  ggsave(g11, file = "output/hepa_monocle_box_first.pdf", height = 6, width = 6)
  
  bounding_df_liver_hep = data.frame(
    x = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast", "Hepatocyte"),1]))[c(1,2,2,1,1)],
    y = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast", "Hepatocyte"),2]), diff = 0.5)[c(1,1,2,2,1)]
  )
  g12 = g1 + geom_path(data = bounding_df_liver_hep, aes(x = y, y = x), 
                       linetype = "dashed", colour = "red",
                       size = 1) +
    NULL
  g12
  ggsave(g12, file = "output/hepa_monocle_box_hep.pdf", height = 6, width = 6)
  
  # highlight where the 5 points are
  pointvals = round(seq(1,sum(liver$nameStates %in% c("Hepatoblast", "Hepatocyte")), length.out = 5))
  ps = liver$Pseudotime
  names(ps) <- colnames(liver)
  ps <- ps[liver$nameStates %in% c("Hepatoblast", "Hepatocyte")]
  pointnames = names(sort(ps))[pointvals]
  
  ps_df = pData(liver)[pointnames,]
  ps_df$Component1 = t(liver@reducedDimS[,pointnames])[,1]
  ps_df$Component2 = t(liver@reducedDimS[,pointnames])[,2]
  
  g12plus = g12 + geom_point(data = ps_df, aes(x = Component2, y = Component1),
                             inherit.aes = FALSE, size = 3)
  g12plus
  ggsave(g12plus, file = "output/hepa_monocle_box_hep_pointdots.pdf", height = 6, width = 6)
  
  bounding_df_liver_chol = data.frame(
    x = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast", "Cholangiocyte"),1]))[c(1,2,2,1,1)],
    y = expandRange(range(dimred_liver[liver$nameStates %in% c("Hepatoblast", "Cholangiocyte"),2]), diff = 0.5)[c(1,1,2,2,1)]
  )
  g13 = g1 + geom_path(data = bounding_df_liver_chol, aes(x = y, y = x), 
                       linetype = "dashed", colour = "red",
                       size = 1) +
    NULL
  g13  
  ggsave(g13, file = "output/hepa_monocle_box_chol.pdf", height = 6, width = 6)
  
  g2 = plot_cell_trajectory(liver, color_by="batch")
  g3 = plot_cell_trajectory(liver, color_by="cellType")
  plottingmarkergenes = c("Gapdh","Sall4", "Epcam","Ahsg")
  
  sapply(plottingmarkergenes, function(mark) {
    g4 = plot_cell_trajectory(liver, markers=mark, 
                              use_color_gradient = TRUE, 
                              begin = 0, end = 1,
                              markers_linear = TRUE,
                              cell_size = 3,
                              x = 2, y = 1) + 
      scale_y_reverse() +
      xlab("") + ylab("") +
      theme(axis.ticks = element_blank()) +
      theme(axis.text = element_blank()) +
      theme(axis.title = element_text(size = 20)) +
      theme(strip.text = element_text(size = 30, face = "italic")) +
      theme(legend.position = "bottom") +
      theme(legend.text = element_text(size = 17)) +
      theme(legend.key.width = unit(1, "inches")) +
      theme(legend.key.height = unit(0.1, "inches")) +
      scale_color_viridis_c(breaks = c(0,10),
                            limits = c(0,10),
                            labels = c("Low","High")) +
      scale_color_viridis_c() +
      labs(color = "Expression") +
      xlab(ifelse(mark == "Gapdh", "Expression", " ")) +
      xlab("") +
      guides(color = guide_colourbar(title.position = "top",
                                     title.hjust = 0.5)) +
      theme(legend.position = ifelse(mark == "Gapdh", "bottom", "none")) +
      theme(legend.title = element_text(size = 20)) +
      NULL
    g4
    if (mark == "Gapdh") {
      g4 = g4 + scale_color_viridis_c(breaks = c(0,13),
                                      limits = c(0,13),
                                      labels = c("Low","High")) +
        NULL
      gg4 = as_ggplot(get_legend(g4))
      g4 = gg4
    }
    ggsave(g4, file = paste0("output/hepa_monocle_", mark, ".pdf"),height = 6, width = 6)
  })
  
  liver$Estage = regmatches(colnames(liver), regexpr("E[0-9][0-9].[0-9]", colnames(liver)))
  g6 = plot_cell_trajectory(liver, color_by="Estage", cell_size = 3,
                            x = 2, y = 1) +
    scale_y_reverse() +
    theme(axis.ticks = element_blank()) +
    theme(axis.text = element_blank()) +
    theme(axis.title = element_text(size = 20)) +
    theme(legend.position = "top") +
    scale_colour_brewer(palette = "Oranges") +
    xlab("") + ylab("") +
    theme(legend.text = element_text(size = 20)) +
    labs(color = "") +
    NULL
  g6
  ggsave(g6, file = "output/hepa_monocle_Estage.pdf",height = 6, width = 6)
  
  g5 = plot_cell_trajectory(liver, markers="Epcam", 
                            use_color_gradient = TRUE, 
                            begin = 0, end = 1,
                            markers_linear = TRUE)
  
  
  pData(liver)$TrajectoryState = states[as.character(pData(liver)$State)]
  
  liver <- orderCells(liver,root_state = root_state)
  
  liver
  
  saveRDS(root_state, file = "output/root_state.Rds")
  saveRDS(HVG, file = "output/HVG.Rds")
  saveRDS(liver, file = "output/liver_pseudotime.Rds")
} else {
  liver = readRDS("output/liver_pseudotime.Rds")
  HVG = readRDS("output/HVG.Rds")
  root_state = readRDS("output/root_state.Rds")
}

Build specific liver data matrices

if (!file.exists("output/liver_branch_data.RData")) {
  head(pData(liver))
  
  plot(pData(liver)$Pseudotime, pData(liver)$State)
  
  # expressed_genes
  # require at least 20% of cells to express at least 3
  hist(apply(liver,1,quantile, 0.9))
  liver_hepa_genes = sort(union(rownames(liver)[apply(liver,1,quantile, 0.8) > 5], HVG))
  length(liver_hepa_genes)
  
  liver_expressed = liver[liver_hepa_genes,]
  
  # order cells based on pseudotime
  # decided what are the values based on manual checking of the States
  # manu check shows state 3 is stem, state 2 is hep, state 1 is chol
  branch_cells_hep = pData(liver_expressed)[pData(liver_expressed)$State %in% c(root_state, hep_state),]
  branch_cells_chol = pData(liver_expressed)[pData(liver_expressed)$State %in% c(root_state, chol_state),]
  
  # obtain the name of cells that represents the ranking
  branch_cells_order_hep = rownames(sort_df(branch_cells_hep,"Pseudotime"))
  branch_cells_order_chol = rownames(sort_df(branch_cells_chol,"Pseudotime"))
  
  # Generate the gene expression for the hep and chol branches
  liver_branch_hep = exprs(liver_expressed[,branch_cells_order_hep])
  liver_branch_chol = exprs(liver_expressed[,branch_cells_order_chol])
  
  # retain the pseudotime estimates
  liver_pseudotime = liver$Pseudotime
  names(liver_pseudotime) <- colnames(liver)
  liver_pseudotime_hep = liver_pseudotime[colnames(liver_branch_hep)]
  liver_pseudotime_chol = liver_pseudotime[colnames(liver_branch_chol)]
  
  save(liver_pseudotime_hep, liver_pseudotime_chol, 
       liver_branch_hep, liver_branch_chol,
       file = "output/liver_branch_data.RData")
} else {
  load("output/liver_branch_data.RData")
}

DE with the pseudotime as the response

if (!file.exists("output/DE_liver.RData")) {
  DE_hep_raw = apply(liver_branch_hep,1,function(x){
    print("testing")
    fit3 = lm(x ~ liver_pseudotime_hep + I(liver_pseudotime_hep^2))
    fit2 = lm(x ~ liver_pseudotime_hep)
    fit1 = lm(x ~ 1)
    f_21 = anova(fit2,fit1)$`Pr(>F)`[2]
    f_32 = anova(fit3,fit2)$`Pr(>F)`[2]
    return(min(c(1, nrow(liver_branch_hep)*f_21, nrow(liver_branch_hep)*f_32)))
  } )
  
  DE_chol_raw = apply(liver_branch_chol,1,function(x){
    print("testing")
    fit3 = lm(x ~ liver_pseudotime_chol + I(liver_pseudotime_chol^2))
    fit2 = lm(x ~ liver_pseudotime_chol)
    fit1 = lm(x ~ 1)
    f_21 = anova(fit2,fit1)$`Pr(>F)`[2]
    f_32 = anova(fit3,fit2)$`Pr(>F)`[2]
    return(min(c(1, nrow(liver_branch_chol)*f_21, nrow(liver_branch_chol)*f_32)))
  } )
  
  nonDE_HVG_hep = intersect(HVG, names(which(DE_hep_raw > 0.05)))
  nonDE_HVG_chol = intersect(HVG, names(which(DE_chol_raw > 0.05)))
  
  save(DE_hep_raw, DE_chol_raw, nonDE_HVG_hep, nonDE_HVG_chol, file = "output/DE_liver.RData")
} else {
  load("output/DE_liver.RData")
}

Perform scHOT variability testing for initial branch of hepatoblasts

weightedVarMatrixStats = function(x, y, w) {
  require(matrixStats)
  weightedVar(x,w)
}

n_first = length(intersect(colnames(liver_branch_chol), colnames(liver_branch_hep)))

W_first = weightMatrix(n_first, span = 0.5)
W_chol = weightMatrix(ncol(liver_branch_chol), span = 0.5*n_first/ncol(liver_branch_chol))
W_hep = weightMatrix(ncol(liver_branch_hep), span = 0.5*n_first/ncol(liver_branch_hep))

scHOT variability test for hepatocyte branch

if (!file.exists("output/DV_hep.RData")) {
  set.seed(500)
  same_pairs_hep = cbind(nonDE_HVG_hep, nonDE_HVG_hep)
  
  hep_DV_stats = DCARSacrossNetwork(liver_branch_hep,
                                    edgelist = same_pairs_hep,
                                    W = W_hep,
                                    weightedConcordanceFunction = weightedVarMatrixStats,
                                    verbose = TRUE,
                                    extractTestStatisticOnly = TRUE)
  names(hep_DV_stats) <- nonDE_HVG_hep
  
  hep_DV_wVars = t(DCARSacrossNetwork(liver_branch_hep,
                                      edgelist = same_pairs_hep,
                                      W = W_hep,
                                      weightedConcordanceFunction = weightedVarMatrixStats,
                                      verbose = TRUE,
                                      extractWcorSequenceOnly = TRUE))
  rownames(hep_DV_wVars) <- nonDE_HVG_hep
  
  hep_DV_permstats_raw = DCARSacrossNetwork(liver_branch_hep,
                                            edgelist = same_pairs_hep,
                                            W = W_hep,
                                            weightedConcordanceFunction = weightedVarMatrixStats,
                                            verbose = TRUE,
                                            extractPermutationTestStatistics = TRUE,
                                            niter = 500)
  hep_DV_permstats = lapply(hep_DV_permstats_raw, unlist)
  names(hep_DV_permstats) <- nonDE_HVG_hep
  
  hep_DV_pvals = sapply(1:length(hep_DV_stats), function(i) {
    mean(hep_DV_permstats[[i]] >= hep_DV_stats[i])
  })
  names(hep_DV_pvals) <- nonDE_HVG_hep
  
  hep_DV_fdr = p.adjust(hep_DV_pvals, method = "BH")
  
  save(W_hep, same_pairs_hep, hep_DV_stats, hep_DV_wVars, hep_DV_permstats, hep_DV_pvals, hep_DV_fdr,
       file = "output/DV_hep.RData")
} else {
  load("output/DV_hep.RData")
}

scHOT variability test for cholangiocyte branch

if (!file.exists("output/DV_chol.RData")) {
  set.seed(500)
  same_pairs_chol = cbind(nonDE_HVG_chol, nonDE_HVG_chol)
  
  chol_DV_stats = DCARSacrossNetwork(liver_branch_chol,
                                     edgelist = same_pairs_chol,
                                     W = W_chol,
                                     weightedConcordanceFunction = weightedVarMatrixStats,
                                     verbose = TRUE,
                                     extractTestStatisticOnly = TRUE)
  names(chol_DV_stats) <- nonDE_HVG_chol
  
  chol_DV_wVars = t(DCARSacrossNetwork(liver_branch_chol,
                                       edgelist = same_pairs_chol,
                                       W = W_chol,
                                       weightedConcordanceFunction = weightedVarMatrixStats,
                                       verbose = TRUE,
                                       extractWcorSequenceOnly = TRUE))
  rownames(chol_DV_wVars) <- nonDE_HVG_chol
  
  chol_DV_permstats_raw = DCARSacrossNetwork(liver_branch_chol,
                                             edgelist = same_pairs_chol,
                                             W = W_chol,
                                             weightedConcordanceFunction = weightedVarMatrixStats,
                                             verbose = TRUE,
                                             extractPermutationTestStatistics = TRUE,
                                             niter = 500)
  chol_DV_permstats = lapply(chol_DV_permstats_raw, unlist)
  names(chol_DV_permstats) <- nonDE_HVG_chol
  
  chol_DV_pvals = sapply(1:length(chol_DV_stats), function(i) {
    mean(chol_DV_permstats[[i]] >= chol_DV_stats[i])
  })
  names(chol_DV_pvals) <- nonDE_HVG_chol
  
  chol_DV_fdr = p.adjust(chol_DV_pvals, method = "BH")
  
  save(W_chol, same_pairs_chol, chol_DV_stats, chol_DV_wVars, chol_DV_permstats, chol_DV_pvals, chol_DV_fdr,
       file = "output/DV_chol.RData")
} else {
  load("output/DV_chol.RData")
}

Extract data for first branch and identify genes to test

if (!file.exists("output/first_data.RData")) {
  first = exprs(liver[,intersect(colnames(liver_branch_hep), colnames(liver_branch_chol))])
  first_pseudotime = liver_pseudotime_chol[intersect(colnames(liver_branch_hep), colnames(liver_branch_chol))]
  
  var.fit <- trendVar(first, parametric = FALSE, loess.args=list(span=0.3), min.mean = 1.5, method = "loess")
  var.out <- decomposeVar(first, var.fit)
  plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", 
       ylab="Variance of log-expression")
  curve(var.fit$trend(x), col="dodgerblue", lwd=2, add=TRUE)
  seqvals = seq(min(var.out$mean), max(var.out$mean), length.out = 1000)
  peakExp = seqvals[which.max(var.fit$trend(seqvals))]
  hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$mean > peakExp),]
  nrow(hvg.out)
  hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),] 
  points(var.out$mean[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], 
         var.out$total[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], col="red", pch=16)
  abline(v = peakExp, lty = 2, col = "black")
  head(hvg.out)
  HVG_first = sort(rownames(hvg.out))
  
  DE_first_raw = apply(first,1,function(x){
    print("testing")
    fit3 = lm(x ~ first_pseudotime + I(first_pseudotime^2))
    fit2 = lm(x ~ first_pseudotime)
    fit1 = lm(x ~ 1)
    f_21 = anova(fit2,fit1)$`Pr(>F)`[2]
    f_32 = anova(fit3,fit2)$`Pr(>F)`[2]
    return(min(c(1, nrow(first)*f_21, nrow(first)*f_32)))
  } )
  
  nonDE_HVG_first = intersect(HVG_first, names(which(DE_first_raw > 0.05)))
  
  save(first, first_pseudotime, HVG_first, nonDE_HVG_first, DE_first_raw, file = "output/first_data.RData")
} else {
  load("output/first_data.RData")
}

scHOT variability test for first branch

if (!file.exists("output/first_DV.RData")) {
  set.seed(500)
  W_first = weightMatrix(ncol(first), span = 0.5)
  same_pairs_first = cbind(nonDE_HVG_first, nonDE_HVG_first)
  rownames(same_pairs_first) <- nonDE_HVG_first
  first_DV_stats = DCARSacrossNetwork(first,
                                      edgelist = same_pairs_first,
                                      W = W_first,
                                      weightedConcordanceFunction = weightedVarMatrixStats,
                                      verbose = TRUE,
                                      extractTestStatisticOnly = TRUE)
  names(first_DV_stats) <- nonDE_HVG_first
  first_DV_wVars = t(DCARSacrossNetwork(first,
                                        edgelist = same_pairs_first,
                                        W = W_first,
                                        weightedConcordanceFunction = weightedVarMatrixStats,
                                        verbose = TRUE,
                                        extractWcorSequenceOnly = TRUE))
  rownames(first_DV_wVars) <- nonDE_HVG_first
  
  if (parallel) {
    split_df = split.data.frame(same_pairs_first, rep(1:ncores, length.out = nrow(same_pairs_first)))
    names(split_df) <- NULL
    first_DV_permstats_raw = mclapply(split_df, function(s) {
      res = DCARSacrossNetwork(first,
                               edgelist = s,
                               W = W_first,
                               weightedConcordanceFunction = weightedVarMatrixStats,
                               verbose = TRUE,
                               extractPermutationTestStatistics = TRUE,
                               niter = 1000)
      res = lapply(res, unlist)
      names(res) <- rownames(s)
      return(res)
    }, mc.cores = ncores)
    first_DV_permstats = unlist(first_DV_permstats_raw, recursive = FALSE)
    first_DV_permstats = first_DV_permstats[rownames(same_pairs_first)]
  } else {
    first_DV_permstats_raw = DCARSacrossNetwork(first,
                                                edgelist = same_pairs_first,
                                                W = W_first,
                                                weightedConcordanceFunction = weightedVarMatrixStats,
                                                verbose = TRUE,
                                                extractPermutationTestStatistics = TRUE,
                                                niter = 1000)
    first_DV_permstats = lapply(first_DV_permstats_raw, unlist)
    names(first_DV_permstats) <- nonDE_HVG_first
  }
  
  first_DV_pvals = sapply(1:length(first_DV_stats), function(i){
    mean(first_DV_permstats[[i]] >= first_DV_stats[i])
  })
  names(first_DV_pvals) <- nonDE_HVG_first
  first_DV_fdr = p.adjust(first_DV_pvals, method = "BH")
  
  save(W_first, same_pairs_first, first_DV_stats, first_DV_wVars, first_DV_permstats, first_DV_pvals, first_DV_fdr, file = "output/first_DV.RData")
} else {
  load("output/first_DV.RData")
}

DV_FDR_level = 0.1

first_sig = names(which(sort(first_DV_fdr)<DV_FDR_level))
dim(first_sig)
## NULL
matplot(t(first_DV_wVars[first_sig, ]), type = "l")

# changes all appear monotonic

# class genes into gain or loss or variability
sig_class = ifelse(first_DV_wVars[first_sig, ncol(first_DV_wVars)] > first_DV_wVars[first_sig, 1],
                   "gain","loss")
table(sig_class)
## sig_class
## gain loss 
##   58   10
sort(sig_class)
##    Asf1b    Birc5    Cadm1    Ccnb2    Cdca3     Eno1    H2afz    Hmgb1 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##    Hmgb2     Hprt   Mad2l1    Mat2a   Mthfd2     Nasp     Nmt2     Pfkl 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##     Sfpq    Snhg4   Trim59     Ybx1    Cdca8   Maged1     Sypl    Ccna2 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##   Metap2    Pnrc2     Rpa1     Smc2    Fubp1     Mcm3    Mier3    Pola2 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##    Ptcd3 Serpinf2      Vtn     Clk1    Trp53    Hmga2    Tacc3     Tpi1 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##   Zfp260     Ect2    Haus3    Hsph1    Nop56      Wrb    Cenpa   Nap1l1 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##     Rfc3    Spcs3   Ncapd2     Rcn3   Shcbp1    Lect2   Zfp871  Tnfaip8 
##   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain"   "gain" 
##     Cd63    Aurka  Creb3l3    Dmgdh   Hmgcs2    Gimd1 Rpl3-ps1   Tmem37 
##   "gain"   "gain"   "loss"   "loss"   "loss"   "loss"   "loss"   "loss" 
##     Insr     G0s2     Nab1    Fbxo8 
##   "loss"   "loss"   "loss"   "loss"
matplot(t(first_DV_wVars[names(which(sig_class == "gain")), ]), type = "l", 
        col = "black", lty = 1)

matplot(t(first_DV_wVars[names(which(sig_class == "loss")), ]), type = "l", 
        col = "black", lty = 1)

genesOfInterest = c("Birc5","H2afz","Tacc3", "Hmgcs2")
gList = sapply(genesOfInterest, function(i) {
  require(patchwork)
  df = data.frame(
    rank = 1:ncol(first),
    pseudotime = first_pseudotime,
    expr = first[i,],
    mean = apply(W_first, 1, function(w) weightedMean(first[i,], w)),
    var = first_DV_wVars[i,],
    sd = sqrt(first_DV_wVars[i,])
  )
  df$lower = df$mean - df$sd
  df$upper = df$mean + df$sd
  
  g1 = ggplot(df, aes(x = pseudotime, y = expr)) + 
    geom_point() +
    geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, colour = NA) +
    geom_line(aes(y = mean)) + 
    theme_minimal() +
    theme(legend.position = "none") +
    theme(panel.grid = element_blank()) +
    ggtitle(ifelse(is.numeric(i),nonDE_HVG_first[i], i))  +
    theme(axis.title = element_text(size = 15)) +
    theme(plot.title = element_text(face = "italic")) +
    theme(axis.line.x = element_line(color="black", size = 0.5),
          axis.line.y = element_line(color="black", size = 0.5)) +
    xlab(ifelse(i == genesOfInterest[1],"Pseudotime","")) +
    ylab(ifelse(i == genesOfInterest[1],"Expression","")) +
    NULL
  
  g2 = DCARS::plotOrderedExpression(list(Hepatocye = liver_branch_hep, Cholangiocyte = liver_branch_chol),
                                    gene = i, 
                                    xvals = list(Hepatocye = liver_pseudotime_hep, Cholangiocyte = liver_pseudotime_chol)) + 
    xlab("Pseudotime") + 
    ylab("Expression") +
    theme(panel.grid = element_blank()) + 
    facet_grid(branch~.) + 
    theme(legend.position = "none") +
    geom_vline(xintercept = max(first_pseudotime), linetype = "dashed", colour = "darkgrey", size = 2) +
    NULL
  
  g1 + g2 + plot_layout(ncol = 1, heights = c(1,2))
  
  return(g1)
}, simplify = FALSE)
g_first_DV_examples = wrap_plots(gList, nrow = 1)
g_first_DV_examples

ggsave(g_first_DV_examples, file = "output/first_DV_gain_ribbon_examples.pdf",
       height = 3, width = 10)

df_gain_raw = sapply(names(which(sig_class == "gain")), function(i){
  df = data.frame(
    rank = 1:ncol(first),
    pseudotime = first_pseudotime,
    expr = first[i,],
    mean = apply(W_first, 1, function(w) weightedMean(first[i,], w)),
    var = first_DV_wVars[i,],
    sd = sqrt(first_DV_wVars[i,])
  )
  df$lower = df$mean - df$sd
  df$upper = df$mean + df$sd
  df$gene = i
  return(df)
}, simplify = FALSE)
df_gain = do.call(rbind, df_gain_raw)

g = ggplot(df_gain, aes(x = pseudotime, y = expr, group = gene)) + 
  geom_point(alpha = 0.5, size = 0.5) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, colour = NA) +
  theme_classic() +
  facet_wrap(gene~., scales = "free") +
  theme(legend.position = "none") +
  theme(panel.grid = element_blank()) +
  xlab("Pseudotime") + 
  ylab("Expression") +
  theme(axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5)) +
  theme(strip.text = element_text(face = "italic")) +
  theme(axis.text.x = element_text(size = 1)) +
  theme(axis.ticks.x = element_blank()) +
  ggtitle("Genes gaining variability in first branch of hepatoblasts") +
  NULL
g

ggsave(g, file = "output/first_DV_gain_ribbon.pdf", height = 8, width = 10)

df_loss_raw = sapply(names(which(sig_class == "loss")), function(i){
  df = data.frame(
    rank = 1:ncol(first),
    pseudotime = first_pseudotime,
    expr = first[i,],
    mean = apply(W_first, 1, function(w) weightedMean(first[i,], w)),
    var = first_DV_wVars[i,],
    sd = sqrt(first_DV_wVars[i,])
  )
  df$lower = df$mean - df$sd
  df$upper = df$mean + df$sd
  df$gene = i
  return(df)
}, simplify = FALSE)
df_loss = do.call(rbind, df_loss_raw)

g = ggplot(df_loss, aes(x = pseudotime, y = expr, group = gene)) + 
  geom_point(alpha = 0.5) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, colour = NA) +
  theme_classic() +
  facet_wrap(gene~., scales = "free", nrow = 2) +
  theme(legend.position = "none") +
  theme(panel.grid = element_blank()) +
  theme(strip.text = element_text(face = "italic")) +
  theme(axis.text.x = element_text(size = 1)) +
  theme(axis.ticks.x = element_blank()) +
  xlab("Pseudotime") + 
  ylab("Expression") +
  ggtitle("Genes losing variability in first branch of hepatoblasts") +
  NULL
g

ggsave(g, file = "output/first_DV_loss_ribbon.pdf", height = 5, width = 10)

Print this out for significantly differentially variable genes.

first_DV_all = data.frame(
  gene = names(first_DV_stats),
  stat = first_DV_stats,
  # globalVar = ,
  pval = first_DV_pvals,
  fdr = first_DV_fdr,
  sig_class = sig_class[names(first_DV_stats)],
  startVar = first_DV_wVars[,1],
  endVar = first_DV_wVars[,ncol(first_DV_wVars)] 
)

first_DV_all_sorted = reshape::sort_df(first_DV_all, "fdr")
write.table(first_DV_all_sorted, file = "output/first_DV_all_sorted.tsv", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(subset(first_DV_all_sorted, fdr < DV_FDR_level), file = "output/first_DV_all_sorted_sig.tsv", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote = FALSE)

Comparison with two sample differential variability test

# Divide data into half/half and perform a normal-distribution based
# F-test, and compare against scHOT testing

var.test.split = 0.5
var.test.split.inds = 1:ncol(first) %in% c(1:floor(var.test.split*ncol(first)))
# var.test.i = 1

var.test.pvals = sapply(1:length(first_DV_pvals), function(var.test.i) {
  var.test(first[names(first_DV_pvals)[var.test.i],var.test.split.inds],
           first[names(first_DV_pvals)[var.test.i],!var.test.split.inds])$p.value
})
names(var.test.pvals) <- names(first_DV_pvals)

var.test.fdr = p.adjust(var.test.pvals, method = "BH")

table(var.test.fdr < DV_FDR_level)
## 
## FALSE  TRUE 
##   281   219
table(first_DV_fdr < DV_FDR_level)
## 
## FALSE  TRUE 
##   432    68
addmargins(table(var.test.fdr < DV_FDR_level, first_DV_fdr < DV_FDR_level))
##        
##         FALSE TRUE Sum
##   FALSE   279    2 281
##   TRUE    153   66 219
##   Sum     432   68 500
plot(-log10(first_DV_pvals + 0.5*min(first_DV_pvals[first_DV_pvals != 0])), -log10(var.test.pvals), type = "n")
text(-log10(first_DV_pvals + 0.5*min(first_DV_pvals[first_DV_pvals != 0])), -log10(var.test.pvals), labels = names(first_DV_pvals))
abline(c(0,1), lty = 2)

topNot = names(sort(var.test.fdr[first_DV_fdr >= DV_FDR_level]))[1:20]

# outlierGenes = c("Tceal8", "Rian", "Rsl1d1", "Mttp", "Meg3", "Nab1", "Shcbp1")
gList = sapply(topNot, function(gene) {
  qplot(1:ncol(first), first[gene,]) + 
    xlab("Pseudotime Ranking") +
    ylab("Expression") + 
    ggtitle(gene) +
    theme_classic() +
    theme(plot.title = element_text(face = "italic")) +
    geom_vline(xintercept = max(which(var.test.split.inds)),linetype = "dashed") + 
    # geom_text(aes(x = x, y = y, label = text), 
    #           data = data.frame(x = max(which(var.test.split.inds)),
    #                             y = 0,
    #                             text = "Trajectory \n middle point"),
    #           inherit.aes = FALSE) +
    NULL
}, simplify = FALSE)
g = wrap_plots(gList)
g

ggsave(g, file = "output/first_DV_topNot.pdf",height = 10, width = 14)
# a number of genes are driven by outlier expression (e.g. Tceal8, Rsl1d1)

# intersection
names(var.test.fdr)[var.test.fdr < DV_FDR_level & first_DV_fdr < DV_FDR_level]
##  [1] "Asf1b"    "Aurka"    "Birc5"    "Cadm1"    "Ccna2"    "Ccnb2"   
##  [7] "Cd63"     "Cdca3"    "Cdca8"    "Cenpa"    "Clk1"     "Creb3l3" 
## [13] "Dmgdh"    "Ect2"     "Eno1"     "Fbxo8"    "Fubp1"    "G0s2"    
## [19] "Gimd1"    "H2afz"    "Haus3"    "Hmga2"    "Hmgb1"    "Hmgb2"   
## [25] "Hmgcs2"   "Hprt"     "Hsph1"    "Insr"     "Lect2"    "Mad2l1"  
## [31] "Maged1"   "Mat2a"    "Mcm3"     "Metap2"   "Mier3"    "Mthfd2"  
## [37] "Nap1l1"   "Nasp"     "Ncapd2"   "Nmt2"     "Nop56"    "Pfkl"    
## [43] "Pnrc2"    "Pola2"    "Ptcd3"    "Rcn3"     "Rfc3"     "Rpa1"    
## [49] "Rpl3-ps1" "Serpinf2" "Sfpq"     "Smc2"     "Snhg4"    "Spcs3"   
## [55] "Sypl"     "Tacc3"    "Tmem37"   "Tnfaip8"  "Tpi1"     "Trim59"  
## [61] "Trp53"    "Vtn"      "Wrb"      "Ybx1"     "Zfp260"   "Zfp871"
# scHOT but not variance
names(var.test.fdr)[var.test.fdr >= DV_FDR_level & first_DV_fdr < DV_FDR_level]
## [1] "Nab1"   "Shcbp1"
# variance but not scHOT
names(var.test.fdr)[var.test.fdr < DV_FDR_level & first_DV_fdr >= DV_FDR_level]
##   [1] "4933434E20Rik" "Aacs"          "Aass"          "Abcg2"        
##   [5] "Actb"          "Actg1"         "Ado"           "Afm"          
##   [9] "Afp"           "Ambp"          "Apoa2"         "Apom"         
##  [13] "Arhgap5"       "Asah1"         "Ass1"          "Atad2"        
##  [17] "Atp6v1b2"      "Atp7a"         "Atrx"          "Bckdk"        
##  [21] "Bex1"          "Bex2"          "Bex4"          "Bsg"          
##  [25] "C1s1"          "Calm1"         "Casp8ap2"      "Ccdc50"       
##  [29] "Ccnb1"         "Ccnd1"         "Cd59a"         "Cdc45"        
##  [33] "Cdca7"         "Ckap5"         "Cks1b"         "Cltc"         
##  [37] "Cript"         "Ctsb"          "Ctsd"          "Cxcl12"       
##  [41] "Ddah1"         "Ddx52"         "Dhrs3"         "Dscr3"        
##  [45] "Eif4ebp2"      "Eno1b"         "F11r"          "F2r"          
##  [49] "Fah"           "Fgg"           "Fign"          "Fignl1"       
##  [53] "Gckr"          "Ghr"           "Gpc3"          "Grb10"        
##  [57] "Gstm4"         "Gtf2a1"        "H3f3b"         "Hat1"         
##  [61] "Hhex"          "Hsd17b4"       "Igfbp1"        "Il1r1"        
##  [65] "Ivns1abp"      "Kcnq1ot1"      "Klf13"         "Lifr"         
##  [69] "Lig1"          "Lrrc58"        "Lrwd1"         "Maged2"       
##  [73] "Maob"          "Mcm4"          "Mcm5"          "Mcm7"         
##  [77] "Meg3"          "Mif"           "Mllt10"        "Mtfr1"        
##  [81] "Mttp"          "Myh10"         "Nek7"          "Nmd3"         
##  [85] "Otud4"         "Pdk1"          "Pecr"          "Peg3"         
##  [89] "Pgk1"          "Pgm2"          "Phgdh"         "Poldip3"      
##  [93] "Ppp1cb"        "Ppp2r5c"       "Ppp6c"         "Ppt1"         
##  [97] "Prdx4"         "Prim1"         "Psmf1"         "Pttg1ip"      
## [101] "Pum2"          "R3hdm1"        "Rabif"         "Rad21"        
## [105] "Rbm47"         "Rbm5"          "Rbp4"          "Rfc2"         
## [109] "Rian"          "Rnmt"          "Rpl36al"       "Rps4l"        
## [113] "Rrm1"          "Rrm2"          "Rsl1d1"        "Sdc4"         
## [117] "Sept11"        "Serpina1b"     "Serpina1d"     "Serpind1"     
## [121] "Serpinf1"      "Sgms1"         "Sh3bgrl"       "Sigirr"       
## [125] "Slbp"          "Snx5"          "Spc25"         "Spin1"        
## [129] "Spint2"        "Srsf1"         "Tars"          "Tceal8"       
## [133] "Tipin"         "Tmpo"          "Tpm4"          "Tpp2"         
## [137] "Trf"           "Trim71"        "Tspan12"       "Ttc4"         
## [141] "Ttr"           "Tuba1a"        "Tuba1c"        "Tubb4b"       
## [145] "Tubb5"         "Tubb6"         "Ubl3"          "Vars"         
## [149] "Vcam1"         "Wdfy3"         "Zc3hav1"       "Zfp644"       
## [153] "Zmynd8"

Comparison of variability along full hepatocyte trajectory

var.test.split.inds = (1:ncol(liver_branch_hep)) %in% c(1:ncol(first))

var.test.pvals = sapply(names(hep_DV_pvals), function(gene) {
  var.test(liver_branch_hep[gene,var.test.split.inds],
           liver_branch_hep[gene,!var.test.split.inds])$p.value
})
# names(var.test.pvals) <- names(hep_DV_pvals)
var.test.fdr = p.adjust(var.test.pvals, method = "BH")

gene = "Cdca8"
plot(hep_DV_wVars[gene,])

var.test.fdr[gene]
##     Cdca8 
## 0.8618713
g = ggplot(data.frame(index = 1:ncol(liver_branch_hep),
                      val = liver_branch_hep[gene,],
                      upper = mean(liver_branch_hep[gene,]) + hep_DV_wVars[gene,],
                      lower = mean(liver_branch_hep[gene,]) - hep_DV_wVars[gene,]),
           aes(x = index, y = val)) +
  geom_point() + 
  xlab("Pseudotime Ranking") + 
  ylab("Expression") +
  ggtitle(gene) +
  geom_ribbon(aes(x = index, ymin = lower, ymax = upper), alpha = 0.5) + 
  geom_vline(xintercept = max(which(var.test.split.inds)), linetype = "dotted") +
  # geom_vline(xintercept = ncol(first)) +
  theme_classic() + 
  geom_text(aes(label = text), data = data.frame(index = ncol(first),
                                                 val = 11,
                                                 text = "Trajectory \n Decision point"),
            inherit.aes = TRUE) +
  theme(plot.title = element_text(face = "italic")) +
  NULL
g

ggsave(g, file = paste0("output/hep_DV_", gene, ".pdf"),height = 4, width = 5)

GO testing for scHOT variable genes in first branch

if (!file.exists("output/GO_list.Rds")) {
  library(GO.db)
  library(org.Mm.eg.db)
  keys = keys(org.Mm.eg.db)
  columns(org.Mm.eg.db)
  GO_info = AnnotationDbi::select(org.Mm.eg.db, keys=keys, columns = c("SYMBOL", "GO"))
  
  keep = GO_info$SYMBOL %in% rownames(liver)
  table(keep)
  GO_info_filt = GO_info[keep,]
  
  # at least 10 genes in the term and less than 500
  allTerms = names(which(table(GO_info_filt$GO) >= 10 & table(GO_info_filt$GO) <= 500))
  
  GO_info_terms = AnnotationDbi::select(GO.db, columns = columns(GO.db), keys = allTerms)
  rownames(GO_info_terms) <- GO_info_terms$GOID
  
  allTermNames = GO_info_terms[allTerms, "TERM"]
  names(allTermNames) <- allTerms
  
  GO_list = sapply(allTerms, function(term) {
    print(term)
    genes = GO_info_filt[GO_info_filt$GO == term, "SYMBOL"]
    return(sort(unique(genes[!is.na(genes)])))
  }, simplify = FALSE)
  names(GO_list) <- allTermNames
  
  saveRDS(GO_list, file = "output/GO_list.Rds")
} else {
  GO_list = readRDS("output/GO_list.Rds")
}

GO_list_DV_first = GO_list[unlist(lapply(GO_list, function(x) any(x %in% same_pairs_first[,1])))]

first_DV_genes = list(gain = names(which(sig_class == "gain")),
                      loss = names(which(sig_class == "loss")))

first_DV_genes_GO = lapply(first_DV_genes, function(set){
  genesetGOtest(set, rownames(liver), GO_list_DV_first)
})

gList = lapply(first_DV_genes_GO, function(pval) {
  df = data.frame(term = factor(names(pval), levels = c(names(pval), "")),
                  pval = pval,
                  qval = p.adjust(pval, method = "BH"))
  df$label = df$term
  df$label[pval != 1] <- ""
  df_sorted = sort_df(df, "pval")[1:10,]
  df_sorted$term = factor(df_sorted$term, levels =  rev(df_sorted$term))
  
  g = ggplot(df_sorted, aes(x = term, y = -log10(pval), fill = qval < 0.05)) +
    theme_classic() +
    geom_col() +
    coord_flip() +
    xlab("") +
    ylab(expression("-log10(P-value)")) +
    # geom_hline(yintercept = -log10(0.01), colour = "red", linetype = "dashed", size = 1.2) +
    scale_fill_manual(values = c("TRUE" = "dimgrey", "FALSE" = "peachpuff")) +
    theme(legend.position = "none") +
    theme(axis.text.y = element_text(size = 10)) +
    theme(title = element_text(hjust = 0.5)) +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) +
    NULL
  return(g)
  
})
gList[[1]] <- gList[[1]] + ggtitle("Gain of variability")
gList[[2]] <- gList[[2]] + ggtitle("Loss of variability")
gList_first_DV = patchwork::wrap_plots(gList, nrow = 1)
ggsave(gList_first_DV, file = "output/first_DV_GO.pdf",height = 4.5, width = 9)
ggsave(gList[[1]], file = "output/first_DV_gain_GO.pdf",height = 4.5, width = 4.5)
ggsave(gList[[2]], file = "output/first_DV_loss_GO.pdf",height = 4.5, width = 4.5)

Compare DV of the first branch variability of hepatocyte and cholangiocyte branches

sig_gain = c(names(which(sig_class == "gain")), names(which(sig_class == "loss")))

chol_DV_wVars_sig_gain = t(DCARSacrossNetwork(liver_branch_chol,
                                              edgelist = cbind(sig_gain,sig_gain),
                                              W = W_chol,
                                              weightedConcordanceFunction = weightedVarMatrixStats,
                                              verbose = TRUE,
                                              extractWcorSequenceOnly = TRUE))
## [1] "Asf1b Asf1b"
## [1] "Birc5 Birc5"
## [1] "Cadm1 Cadm1"
## [1] "Ccnb2 Ccnb2"
## [1] "Cdca3 Cdca3"
## [1] "Eno1 Eno1"
## [1] "H2afz H2afz"
## [1] "Hmgb1 Hmgb1"
## [1] "Hmgb2 Hmgb2"
## [1] "Hprt Hprt"
## [1] "Mad2l1 Mad2l1"
## [1] "Mat2a Mat2a"
## [1] "Mthfd2 Mthfd2"
## [1] "Nasp Nasp"
## [1] "Nmt2 Nmt2"
## [1] "Pfkl Pfkl"
## [1] "Sfpq Sfpq"
## [1] "Snhg4 Snhg4"
## [1] "Trim59 Trim59"
## [1] "Ybx1 Ybx1"
## [1] "Cdca8 Cdca8"
## [1] "Maged1 Maged1"
## [1] "Sypl Sypl"
## [1] "Ccna2 Ccna2"
## [1] "Metap2 Metap2"
## [1] "Pnrc2 Pnrc2"
## [1] "Rpa1 Rpa1"
## [1] "Smc2 Smc2"
## [1] "Fubp1 Fubp1"
## [1] "Mcm3 Mcm3"
## [1] "Mier3 Mier3"
## [1] "Pola2 Pola2"
## [1] "Ptcd3 Ptcd3"
## [1] "Serpinf2 Serpinf2"
## [1] "Vtn Vtn"
## [1] "Clk1 Clk1"
## [1] "Trp53 Trp53"
## [1] "Hmga2 Hmga2"
## [1] "Tacc3 Tacc3"
## [1] "Tpi1 Tpi1"
## [1] "Zfp260 Zfp260"
## [1] "Ect2 Ect2"
## [1] "Haus3 Haus3"
## [1] "Hsph1 Hsph1"
## [1] "Nop56 Nop56"
## [1] "Wrb Wrb"
## [1] "Cenpa Cenpa"
## [1] "Nap1l1 Nap1l1"
## [1] "Rfc3 Rfc3"
## [1] "Spcs3 Spcs3"
## [1] "Ncapd2 Ncapd2"
## [1] "Rcn3 Rcn3"
## [1] "Shcbp1 Shcbp1"
## [1] "Lect2 Lect2"
## [1] "Zfp871 Zfp871"
## [1] "Tnfaip8 Tnfaip8"
## [1] "Cd63 Cd63"
## [1] "Aurka Aurka"
## [1] "Creb3l3 Creb3l3"
## [1] "Dmgdh Dmgdh"
## [1] "Hmgcs2 Hmgcs2"
## [1] "Gimd1 Gimd1"
## [1] "Rpl3-ps1 Rpl3-ps1"
## [1] "Tmem37 Tmem37"
## [1] "Insr Insr"
## [1] "G0s2 G0s2"
## [1] "Nab1 Nab1"
## [1] "Fbxo8 Fbxo8"
rownames(chol_DV_wVars_sig_gain) <- sig_gain

hep_DV_wVars_sig_gain = t(DCARSacrossNetwork(liver_branch_hep,
                                             edgelist = cbind(sig_gain,sig_gain),
                                             W = W_hep,
                                             weightedConcordanceFunction = weightedVarMatrixStats,
                                             verbose = TRUE,
                                             extractWcorSequenceOnly = TRUE))
## [1] "Asf1b Asf1b"
## [1] "Birc5 Birc5"
## [1] "Cadm1 Cadm1"
## [1] "Ccnb2 Ccnb2"
## [1] "Cdca3 Cdca3"
## [1] "Eno1 Eno1"
## [1] "H2afz H2afz"
## [1] "Hmgb1 Hmgb1"
## [1] "Hmgb2 Hmgb2"
## [1] "Hprt Hprt"
## [1] "Mad2l1 Mad2l1"
## [1] "Mat2a Mat2a"
## [1] "Mthfd2 Mthfd2"
## [1] "Nasp Nasp"
## [1] "Nmt2 Nmt2"
## [1] "Pfkl Pfkl"
## [1] "Sfpq Sfpq"
## [1] "Snhg4 Snhg4"
## [1] "Trim59 Trim59"
## [1] "Ybx1 Ybx1"
## [1] "Cdca8 Cdca8"
## [1] "Maged1 Maged1"
## [1] "Sypl Sypl"
## [1] "Ccna2 Ccna2"
## [1] "Metap2 Metap2"
## [1] "Pnrc2 Pnrc2"
## [1] "Rpa1 Rpa1"
## [1] "Smc2 Smc2"
## [1] "Fubp1 Fubp1"
## [1] "Mcm3 Mcm3"
## [1] "Mier3 Mier3"
## [1] "Pola2 Pola2"
## [1] "Ptcd3 Ptcd3"
## [1] "Serpinf2 Serpinf2"
## [1] "Vtn Vtn"
## [1] "Clk1 Clk1"
## [1] "Trp53 Trp53"
## [1] "Hmga2 Hmga2"
## [1] "Tacc3 Tacc3"
## [1] "Tpi1 Tpi1"
## [1] "Zfp260 Zfp260"
## [1] "Ect2 Ect2"
## [1] "Haus3 Haus3"
## [1] "Hsph1 Hsph1"
## [1] "Nop56 Nop56"
## [1] "Wrb Wrb"
## [1] "Cenpa Cenpa"
## [1] "Nap1l1 Nap1l1"
## [1] "Rfc3 Rfc3"
## [1] "Spcs3 Spcs3"
## [1] "Ncapd2 Ncapd2"
## [1] "Rcn3 Rcn3"
## [1] "Shcbp1 Shcbp1"
## [1] "Lect2 Lect2"
## [1] "Zfp871 Zfp871"
## [1] "Tnfaip8 Tnfaip8"
## [1] "Cd63 Cd63"
## [1] "Aurka Aurka"
## [1] "Creb3l3 Creb3l3"
## [1] "Dmgdh Dmgdh"
## [1] "Hmgcs2 Hmgcs2"
## [1] "Gimd1 Gimd1"
## [1] "Rpl3-ps1 Rpl3-ps1"
## [1] "Tmem37 Tmem37"
## [1] "Insr Insr"
## [1] "G0s2 G0s2"
## [1] "Nab1 Nab1"
## [1] "Fbxo8 Fbxo8"
rownames(hep_DV_wVars_sig_gain) <- sig_gain

first_DV_wVars_sig_gain = first_DV_wVars[sig_gain,]

first_wVAR_comparison_df_chol = data.frame(
  branch = "chol",
  gene = rep(rownames(chol_DV_wVars_sig_gain), ncol(chol_DV_wVars_sig_gain)),
  position = rep(1:ncol(chol_DV_wVars_sig_gain), each = nrow(chol_DV_wVars_sig_gain)),
  wVAR = c(chol_DV_wVars_sig_gain),
  pseudotime = liver_pseudotime_chol[rep(1:ncol(chol_DV_wVars_sig_gain), each = nrow(chol_DV_wVars_sig_gain))]
)
first_wVAR_comparison_df_hep = data.frame(
  branch = "hep",
  gene = rep(rownames(hep_DV_wVars_sig_gain), ncol(hep_DV_wVars_sig_gain)),
  position = rep(1:ncol(hep_DV_wVars_sig_gain), each = nrow(hep_DV_wVars_sig_gain)),
  wVAR = c(hep_DV_wVars_sig_gain),
  pseudotime = liver_pseudotime_hep[rep(1:ncol(hep_DV_wVars_sig_gain), each = nrow(hep_DV_wVars_sig_gain))]
)
first_wVAR_comparison_df_first = data.frame(
  branch = "first",
  gene = rep(rownames(first_DV_wVars_sig_gain[sig_gain,]), ncol(first_DV_wVars_sig_gain[sig_gain,])),
  position = rep(1:ncol(first_DV_wVars_sig_gain[sig_gain,]), each = nrow(first_DV_wVars_sig_gain[sig_gain,])),
  wVAR = c(first_DV_wVars_sig_gain[sig_gain,]),
  pseudotime = liver_pseudotime_chol[rep(1:ncol(first_DV_wVars_sig_gain[sig_gain,]), each = nrow(first_DV_wVars_sig_gain[sig_gain,]))]
)
first_wVAR_comparison_df = rbind(
  first_wVAR_comparison_df_chol,
  first_wVAR_comparison_df_hep,
  first_wVAR_comparison_df_first
)
first_wVAR_comparison_df$chol_isnonDE = first_wVAR_comparison_df$gene %in% nonDE_HVG_chol
first_wVAR_comparison_df$hep_isnonDE = first_wVAR_comparison_df$gene %in% nonDE_HVG_hep

ggplot(first_wVAR_comparison_df, aes(x = pseudotime, y = wVAR, group = branch, colour = branch)) + 
  geom_line(size = 2) + 
  facet_wrap(chol_isnonDE~gene) +
  theme_minimal() +
  NULL

g1 = ggplot(subset(first_wVAR_comparison_df, chol_isnonDE & hep_isnonDE), 
            aes(x = position, y = wVAR, group = branch, colour = branch)) + 
  geom_line(size = 2) + 
  facet_wrap(~gene, scales = "free") +
  theme_minimal() +
  ggtitle("not DE in either") +
  geom_vline(xintercept = length(first_pseudotime)) +
  NULL
g1

g2 = ggplot(subset(first_wVAR_comparison_df, !chol_isnonDE & hep_isnonDE), 
            aes(x = position, y = wVAR, group = branch, colour = branch)) + 
  geom_line(size = 2) + 
  facet_wrap(~gene, scales = "free") +
  theme_minimal() +
  ggtitle("DE only in chol") +
  geom_vline(xintercept = length(first_pseudotime)) +
  NULL
g2

g3 = ggplot(subset(first_wVAR_comparison_df, chol_isnonDE & !hep_isnonDE), 
            aes(x = position, y = wVAR, group = branch, colour = branch)) + 
  geom_line(size = 2) + 
  facet_wrap(~gene, scales = "free") +
  theme_minimal() +
  ggtitle("DE only in hep") +
  geom_vline(xintercept = length(first_pseudotime)) +
  NULL
g3

g4 = ggplot(subset(first_wVAR_comparison_df, !chol_isnonDE & !hep_isnonDE), 
            aes(x = position, y = wVAR, group = branch, colour = branch)) + 
  geom_line(size = 2) + 
  facet_wrap(~gene, scales = "free") +
  theme_minimal() +
  ggtitle("DE in both hep and chol") +
  geom_vline(xintercept = length(first_pseudotime)) +
  NULL
g4

ggsave(g1, file = "output/first_wVAR_comparison_1.pdf",height = 10, width = 12)
ggsave(g2, file = "output/first_wVAR_comparison_2.pdf",height = 10, width = 12)
ggsave(g3, file = "output/first_wVAR_comparison_3.pdf",height = 10, width = 12)
ggsave(g4, file = "output/first_wVAR_comparison_4.pdf",height = 10, width = 12)

first_wVAR_comparison_df$branch = factor(first_wVAR_comparison_df$branch, levels = c("hep", "chol", "first"))

genesOfInterest = c("Birc5","H2afz","Tacc3", "Hmgcs2")
gList = sapply(genesOfInterest, function(geneOfInterest) {
  g5 = ggplot(subset(first_wVAR_comparison_df, gene %in% geneOfInterest), 
              aes(x = position, y = wVAR, group = branch, colour = branch,
                  alpha = branch, size = branch)) + 
    geom_line() + 
    theme_minimal() +
    theme(legend.position = "none") +
    ggtitle(geneOfInterest) +
    theme(panel.grid = element_blank()) +
    xlab("Pseudotime ranking") +
    ylab("Local weighted variance") +
    geom_vline(xintercept = length(first_pseudotime)) +
    scale_colour_tableau() +
    scale_alpha_manual(values = c("chol" = 0.5, "hep" = 0.5, "first" = 1)) +
    scale_size_manual(values = c("chol" = 1, "hep" = 1, "first" = 2)) +
    theme(axis.title = element_text(size = 15)) +
    theme(plot.title = element_text(face = "italic")) +
    theme(axis.line.x = element_line(color="black", size = 0.5),
          axis.line.y = element_line(color="black", size = 0.5)) +
    xlab(ifelse(geneOfInterest == genesOfInterest[1],"Pseudotime ranking","")) +
    ylab(ifelse(geneOfInterest == genesOfInterest[1],"Local weighted variance","")) +
    NULL
  return(g5)
}, simplify = FALSE)
gList_DV_lines = wrap_plots(gList, nrow = 1)
gList_DV_lines

ggsave(gList_DV_lines, file = "output/first_DV_gain_lines_examples.pdf",
       height = 3, width = 10)

Perform scHOT correlation testing for all pairs of selected genes

First calculate weighted correlation vectors

# calculate the wcors for all the ordering genes pairs
if (!file.exists("output/exprs_HVG.RData")) {
  hep_exprs_HVG = exprs(liver)[nonDE_HVG_hep, names(liver_pseudotime_hep)]
  chol_exprs_HVG = exprs(liver)[nonDE_HVG_chol, names(liver_pseudotime_chol)]
  save(hep_exprs_HVG, chol_exprs_HVG, file = "output/exprs_HVG.RData")
} else {
  load("output/exprs_HVG.RData")
}

# HVG calculated over entire trajectory cells
pairs = t(combn(sort(HVG), 2))
rownames(pairs) <- apply(pairs,1,paste0, collapse = "_")

pairs_hep_all = t(combn(sort(nonDE_HVG_hep), 2))
rownames(pairs_hep_all) <- apply(pairs_hep_all,1,paste0, collapse = "_")

pairs_chol_all = t(combn(sort(nonDE_HVG_chol), 2))
rownames(pairs_chol_all) <- apply(pairs_chol_all,1,paste0, collapse = "_")  

if (!file.exists("output/all_wcors.RData")) {
  
  pairs_hep_all = t(combn(sort(nonDE_HVG_hep), 2))
  rownames(pairs_hep_all) <- apply(pairs_hep_all,1,paste0, collapse = "_")
  
  pairs_chol_all = t(combn(sort(nonDE_HVG_chol), 2))
  rownames(pairs_chol_all) <- apply(pairs_chol_all,1,paste0, collapse = "_")  
  
  W_hep = weightMatrix(ncol(hep_exprs_HVG), span = 0.25)
  
  if (parallel) {
    pairs_hep_split = split.data.frame(pairs_hep_all, rep(1:ncores, length.out = nrow(pairs_hep_all)))
    wcors_hep_raw = mclapply(pairs_hep_split, function(pairs){
      t(DCARSacrossNetwork(hep_exprs_HVG,
                           edgelist = pairs,
                           W = W_hep,
                           extractWcorSequenceOnly = TRUE, 
                           weightedConcordanceFunction = weightedSpearman,
                           verbose = TRUE))
    }, mc.cores = ncores)
    wcors_hep = do.call(rbind, wcors_hep_raw)[rownames(pairs_hep_all),]
  } else {
    wcors_hep = t(DCARSacrossNetwork(hep_exprs_HVG,
                                     edgelist = pairs_hep_all,
                                     W = W_hep,
                                     extractWcorSequenceOnly = TRUE, 
                                     weightedConcordanceFunction = weightedSpearman,
                                     verbose = TRUE))
  }
  
  W_chol = weightMatrix(ncol(chol_exprs_HVG), span = 0.25)
  
  if (parallel) {
    pairs_chol_split = split.data.frame(pairs_chol_all, rep(1:ncores, length.out = nrow(pairs_chol_all)))
    wcors_chol_raw = mclapply(pairs_chol_split, function(pairs){
      t(DCARSacrossNetwork(chol_exprs_HVG,
                           edgelist = pairs,
                           W = W_chol,
                           extractWcorSequenceOnly = TRUE, 
                           weightedConcordanceFunction = weightedSpearman,
                           verbose = TRUE))
    }, mc.cores = ncores)
    wcors_chol = do.call(rbind, wcors_chol_raw)[rownames(pairs_chol_all),]
  } else {
    wcors_chol = t(DCARSacrossNetwork(chol_exprs_HVG,
                                      edgelist = pairs_chol_all,
                                      W = W_chol,
                                      extractWcorSequenceOnly = TRUE, 
                                      weightedConcordanceFunction = weightedSpearman,
                                      verbose = TRUE))
  }
  
  save(wcors_hep, wcors_chol, W_chol, W_hep, pairs_chol_all, pairs_hep_all, file = "output/all_wcors.RData")
  
} else {
  load("output/all_wcors.RData")
}

Calculate permutation statistics for a subset of hepatocyte genepairs

if (!file.exists("output/hep_permstats.Rdata")) {
  set.seed(500)
  hep_stats_all = apply(wcors_hep,1,sd)
  hep_globalCor_all = apply(pairs_hep_all, 1, function(x)
    cor(hep_exprs_HVG[x[1],], hep_exprs_HVG[x[2],], method = "spearman")
  )
  hep_sample = DCARS::stratifiedSample(hep_globalCor_all, length = 500)
  
  if (parallel) {
    hep_permstats_raw = mclapply(as.list(hep_sample), function(h) {
      DCARSacrossNetwork(hep_exprs_HVG,
                         edgelist = pairs_hep_all[h, , drop = FALSE],
                         W = W_hep,
                         weightedConcordanceFunction = weightedSpearman,
                         verbose = TRUE,
                         extractPermutationTestStatistics = TRUE,
                         niter = 1000)
    }, mc.cores = ncores)
    hep_permstats = lapply(hep_permstats_raw, unlist)
    names(hep_permstats) <- rownames(pairs_hep_all[hep_sample,])
  } else {
    hep_permstats_raw = DCARSacrossNetwork(hep_exprs_HVG,
                                           edgelist = pairs_hep_all[hep_sample,],
                                           W = W_hep,
                                           weightedConcordanceFunction = weightedSpearman,
                                           verbose = TRUE,
                                           extractPermutationTestStatistics = TRUE,
                                           niter = 500)
    hep_permstats = unlist(hep_permstats_raw, recursive = FALSE)
    names(hep_permstats) <- rownames(pairs_hep_all[hep_sample,])
  }
  save(hep_permstats, hep_globalCor_all, hep_stats_all, file = "output/hep_permstats.Rdata")
} else {
  load("output/hep_permstats.Rdata")
}

if (!file.exists("output/hep_p_all.Rds")) {
  hep_p_all = estimatePvaluesSpearman(hep_stats_all, 
                                      hep_globalCor_all, 
                                      hep_permstats,
                                      usenperm = FALSE,
                                      nperm = 10000,
                                      plot = TRUE,
                                      maxDist = 0.1,
                                      verbose = TRUE)
  hep_p_all$fdr <- p.adjust(hep_p_all$pval, method = "BH")
  saveRDS(hep_p_all, file = "output/hep_p_all.Rds")
} else {
  hep_p_all <- readRDS("output/hep_p_all.Rds")
}

FDR_level = 0.2

Full permutation test

Compare estimated p-values with full 10,000 permutations for all gene pairs. This takes a very long time to run, was performed on the cluster.

if (!file.exists("output/hep_full_pvals.Rds")) {
  source("hep_all_permutations.R")
}
hep_full_pvals = readRDS("output/hep_full_pvals.Rds")

plot(-log10(hep_full_pvals), -log10(hep_p_all$pval)); abline(c(0,1))

cor(-log10(hep_full_pvals), -log10(hep_p_all$pval), 
    method = "spearman", use = "p")
## [1] 0.9948061
hep_full_pvals[hep_full_pvals == 0] <- 1/10001

full_p_df = data.frame(
  gene = names(hep_full_pvals),
  pval_all = hep_full_pvals,
  pval_est = hep_p_all$pval
)
full_p_df$loess = loess(I(-log10(pval_est)) ~ I(-log10(pval_all)), data = full_p_df, span = 0.3)$fitted
full_p_df = reshape::sort_df(full_p_df, "pval_all")
g = ggplot(full_p_df, aes(x = -log10(pval_all), y = -log10(pval_est))) + 
  geom_point(colour = "darkgrey", alpha = 0.7) +
  geom_line(aes(y = loess), colour = "red") +
  theme_classic() +
  xlab("P-value (10,000 permutations each)") +
  ylab("Estimated P-value") +
  geom_vline(xintercept = 2, linetype = "dashed", colour = "grey") +
  geom_hline(yintercept = 2, linetype = "dashed", colour = "grey") +
  geom_abline(slope = 1, intercept = 0, colour = "black") +
  geom_text(x = 0.15, y = 5, label = paste0("Spearman \ncorrelation: \n ", 
                                            round(cor(-log10(hep_full_pvals), -log10(hep_p_all$pval), 
                                                      method = "spearman", use = "p"), 3))) +
  NULL
g

ggsave(g, file = "output/hep_full_pval_scatterplot.pdf",height = 6, width = 6)

fit = lm(I(-log10(pval_est)) ~ I(-log10(pval_all)), data = full_p_df)
fit$coef
##         (Intercept) I(-log10(pval_all)) 
##         0.005005454         1.001514292

Calculate test statistics using other underlying higher order functions and compare

Above was done with weighted Spearman, also perform with weightedPearson, MIC and brownian distance correlation. The latter two with a blocked weightmatrix due to implementation.

# calculate global values for each and take stratified sample of size 100 from 
# each and take the union of these
if (!file.exists("output/test_statistics_output.RData")) {
  source("test_statistics_hep.R")
}
load("output/test_statistics_global.RData")
load("output/test_statistics_output.RData")

test_statistics_p_all_all = do.call(rbind, test_statistics_p_all)
test_statistics_p_all_all$testing <- rep(names(test_statistics_p_all), 
                                         times = unlist(lapply(test_statistics_p_all, nrow)))

test_statistics_pvals = do.call(cbind,lapply(test_statistics_p_all, "[", "pval"))
colnames(test_statistics_pvals) <- names(test_statistics_p_all)

test_statistics_cor = cor(-log10(test_statistics_pvals), method = "spearman")

g = ggpairs(-log10(test_statistics_pvals),
            
            lower = list(continuous = wrap("points", alpha = 0.3, size=0.1), 
                         combo = wrap("dot", alpha = 0.4, size=0.2)),
            title = "-log10(P-value)"
)

g

ggsave(g, file = "output/test_statistics_pairs.pdf",height = 8, width = 8)

pdf("output/test_statistics_corrplot.pdf", height = 6, width = 6)
g_cor = corrplot(test_statistics_cor, order = "hclust")
dev.off()
## quartz_off_screen 
##                 2
pdf("output/test_statistics_upset.pdf", height = 5, width = 8, onefile = FALSE)
upset(data = as.data.frame(as.matrix(1*(apply(test_statistics_pvals,2,p.adjust, method = "BH") < 0.2))),
      sets = colnames(test_statistics_pvals))
dev.off()
## quartz_off_screen 
##                 2
# distribution graphs
test_statistics_distn = data.frame(
  quantile_999 = unlist(lapply(test_statistics_perms, function(x) unlist(lapply(x, quantile, probs = 0.999)))),
  quantile_90 = unlist(lapply(test_statistics_perms, function(x) unlist(lapply(x, quantile, probs = 0.90)))),
  quantile_95 = unlist(lapply(test_statistics_perms, function(x) unlist(lapply(x, quantile, probs = 0.95)))),
  quantile_99 = unlist(lapply(test_statistics_perms, function(x) unlist(lapply(x, quantile, probs = 0.99)))),
  global = unlist(lapply(test_statistics_global, "[", test_statistics_sampled_ind)),
  test_statistic = rep(names(test_statistics_global), times = unlist(lapply(test_statistics_perms, length)))
)
test_statistics_distn <- reshape::sort_df(test_statistics_distn, "global")

test_statistics_distn$quantile_999_fitted = unsplit(lapply(split.data.frame(test_statistics_distn, test_statistics_distn$test_statistic),
                                                           function(df){
                                                             loess(quantile_999 ~ global, data = df)$fitted
                                                           }), test_statistics_distn$test_statistic)

test_statistics_distn$quantile_90_fitted = unsplit(lapply(split.data.frame(test_statistics_distn, test_statistics_distn$test_statistic),
                                                          function(df){
                                                            loess(quantile_90 ~ global, data = df)$fitted
                                                          }), test_statistics_distn$test_statistic)

test_statistics_distn$quantile_95_fitted = unsplit(lapply(split.data.frame(test_statistics_distn, test_statistics_distn$test_statistic),
                                                          function(df){
                                                            loess(quantile_95 ~ global, data = df)$fitted
                                                          }), test_statistics_distn$test_statistic)

test_statistics_distn$quantile_99_fitted = unsplit(lapply(split.data.frame(test_statistics_distn, test_statistics_distn$test_statistic),
                                                          function(df){
                                                            loess(quantile_99 ~ global, data = df)$fitted
                                                          }), test_statistics_distn$test_statistic)

test_statistics_distn$test_statistic = factor(test_statistics_distn$test_statistic,
                                              levels = names(test_statistics_global))

test_statistics_nicecolours = RColorBrewer::brewer.pal(name = "Blues", 9)
names(test_statistics_nicecolours) <- c(rep("", 5), "0.90","0.95","0.99","0.999")
g = ggplot(test_statistics_distn, aes(x = global))  + 
  
  geom_point(aes(y = quantile_999, colour = "0.999"),
             alpha = 0.5) +#, colour = test_statistics_nicecolours[9]) +
  geom_line(aes(y = quantile_999_fitted), colour = test_statistics_nicecolours[9]) +
  
  geom_point(aes(y = quantile_99, colour = "0.99"),
             alpha = 0.5) + #, colour = test_statistics_nicecolours[8]) +
  geom_line(aes(y = quantile_99_fitted), colour = test_statistics_nicecolours[8]) +
  
  geom_point(aes(y = quantile_95, colour = "0.95"),
             alpha = 0.5) + #, colour = test_statistics_nicecolours[7]) +
  geom_line(aes(y = quantile_95_fitted), colour = test_statistics_nicecolours[7]) +
  
  geom_point(aes(y = quantile_90, colour = "0.90"),
             alpha = 0.5) + #, colour = test_statistics_nicecolours[6]) +
  geom_line(aes(y = quantile_90_fitted), colour = test_statistics_nicecolours[6]) +
  facet_wrap(~test_statistic, scales = "free", ncol = 2) + 
  theme_classic() + 
  xlab("Global higher order statistic value") +
  ylab("Permuted scHOT test statistic quantile") +
  scale_colour_manual(name="Quantiles",values=test_statistics_nicecolours[6:9]) +
  theme(legend.position = "bottom") +
  NULL
ggsave(g, file = "output/test_statistics_pvalEstimation.pdf",
       height = 12, width = 10)

Compare different span choices

Run on cluster

if (!file.exists("output/span_output.RData")) {
  source("span_hep.R")
}
load("output/span_output.RData")

span_p_all_all = do.call(rbind, span_p_all)
span_p_all_all$testing <- rep(names(span_p_all), 
                              times = unlist(lapply(span_p_all, nrow)))

span_pvals = do.call(cbind,lapply(span_p_all, "[", "pval"))
colnames(span_pvals) <- names(span_p_all)

span_cor = cor(-log10(span_pvals), method = "spearman")

pdf("output/span_corrplot.pdf", height = 10, width = 10)
g_cor = corrplot(span_cor, order = "original")
dev.off()
## quartz_off_screen 
##                 2
pdf("output/span_upset.pdf", height = 10, width = 16, onefile = FALSE)
upset(data = as.data.frame(as.matrix(1*(apply(span_pvals,2,p.adjust, method = "BH") < FDR_level))),
      sets = colnames(span_pvals),
      order.by = "freq",
      text.scale = 2)
dev.off()
## quartz_off_screen 
##                 2
# distribution graphs
span_distn = data.frame(
  quantile_999 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.999)))),
  quantile_90 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.90)))),
  quantile_95 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.95)))),
  quantile_99 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.99)))),
  global = unlist(lapply(span_global, "[", span_sampled_ind)),
  span = rep(names(span_perms), times = unlist(lapply(span_perms, length)))
)
span_distn <- reshape::sort_df(span_distn, "global")

span_distn$quantile_999_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                                function(df){
                                                  loess(quantile_999 ~ global, data = df)$fitted
                                                }), span_distn$span)

span_distn$quantile_90_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                               function(df){
                                                 loess(quantile_90 ~ global, data = df)$fitted
                                               }), span_distn$span)

span_distn$quantile_95_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                               function(df){
                                                 loess(quantile_95 ~ global, data = df)$fitted
                                               }), span_distn$span)

span_distn$quantile_99_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                               function(df){
                                                 loess(quantile_99 ~ global, data = df)$fitted
                                               }), span_distn$span)

span_distn$span = factor(span_distn$span,levels = names(span_perms))

span_nicecolours = RColorBrewer::brewer.pal(name = "Blues", 9)
names(span_nicecolours) <- c(rep("", 5), "0.90","0.95","0.99","0.999")
g = ggplot(span_distn, aes(x = global))  + 
  
  geom_point(aes(y = quantile_999, colour = "0.999"),
             alpha = 0.5) +#, colour = span_nicecolours[9]) +
  geom_line(aes(y = quantile_999_fitted), colour = span_nicecolours[9]) +
  
  geom_point(aes(y = quantile_99, colour = "0.99"),
             alpha = 0.5) + #, colour = span_nicecolours[8]) +
  geom_line(aes(y = quantile_99_fitted), colour = span_nicecolours[8]) +
  
  geom_point(aes(y = quantile_95, colour = "0.95"),
             alpha = 0.5) + #, colour = span_nicecolours[7]) +
  geom_line(aes(y = quantile_95_fitted), colour = span_nicecolours[7]) +
  
  geom_point(aes(y = quantile_90, colour = "0.90"),
             alpha = 0.5) + #, colour = span_nicecolours[6]) +
  geom_line(aes(y = quantile_90_fitted), colour = span_nicecolours[6]) +
  # geom_point(aes(y = quantile_99_fitted)) + 
  facet_wrap(~span, scales = "free", ncol = 4) + 
  theme_classic() + 
  xlab("Global higher order statistic value") +
  ylab("Permuted scHOT test statistic quantile") +
  scale_colour_manual(name="Quantiles",values=span_nicecolours[6:9]) +
  theme(legend.position = "bottom") +
  NULL
g

ggsave(g, file = "output/span_pvalEstimation.pdf",
       height = 12, width = 14)

# what do the big ones and small ones look like 0.35 and below, and then 0.40 and above, and both

span_selection = as.data.frame(as.matrix(1*(apply(span_pvals,2,p.adjust, method = "BH") < FDR_level)))

any_span_selected = names(which(apply(span_selection,1,any)))
low_span_selected = names(which(apply(span_selection[,1:7],1,any)))
high_span_selected = names(which(apply(span_selection[,8:ncol(span_selection)],1,any)))
both_span_selected = intersect(low_span_selected, high_span_selected)
low_span_selected_only = setdiff(low_span_selected, high_span_selected)
high_span_selected_only = setdiff(high_span_selected, low_span_selected)

matplot(t(wcors_hep[low_span_selected_only,]), type = "l", col = "black", lty = 1)

matplot(t(wcors_hep[high_span_selected_only,]), type = "l", col = "black", lty = 1)

i = 10 # diff number
wcors_hep_diff = t(apply(wcors_hep[any_span_selected,],1,function(x) loess(I(diff(x,i)) ~ I(1:(length(x) - i)))$fitted))
wcors_hep_diff_signchanges = apply(wcors_hep_diff, 1, function(x) length(unique(x < 0)))
prop.table(table(wcors_hep_diff_signchanges[low_span_selected_only]))
## 
##          1          2 
## 0.03389831 0.96610169
prop.table(table(wcors_hep_diff_signchanges[high_span_selected_only]))
## 
##         1         2 
## 0.1268116 0.8731884
matplot(t(wcors_hep_diff[low_span_selected_only,]), type = "l", col = "black", lty = 1)
matplot(t(wcors_hep_diff[high_span_selected_only,]), type = "l", col = "red", lty = 1, add = TRUE)

pdf("output/span_slopes_density.pdf",height = 8, width = 8)
plot(density(wcors_hep_diff[low_span_selected_only,]), ylim = c(0,30), col = "blue",
     main = "Density of slopes of local weighted correlation", lwd = 2)
lines(density(wcors_hep_diff[high_span_selected_only,]), col = "red", lwd = 2)
lines(density(wcors_hep_diff[any_span_selected,]), col = "black", lty = "dotted", lwd = 2)
legend("topright", legend = c("Selected with low span only", "Selected with high span only",
                              "Selected with any span"),
       col = c("blue","red","black"),
       lty = c("solid","solid","dotted"),
       lwd = 2,
       bty = "n")
dev.off()
## quartz_off_screen 
##                 2

Assess stability under random subsetting of the data

Takes a long time so was run on the cluster.

if (!file.exists("output/subset_output_weightmatch_noperms.RData")) {
  source("subset_hep_weightmatch.R")
}
load("output/subset_output_weightmatch_noperms.RData")

subset_p_all_all = do.call(rbind, subset_p_all)
subset_p_all_all$testing <- rep(names(subset_p_all), 
                                times = unlist(lapply(subset_p_all, nrow)))
subset_p_all_all$testingType = unlist(lapply(strsplit(as.character(subset_p_all_all$testing), "_"), "[", 2))

subset_pvals = do.call(cbind,lapply(subset_p_all, "[", "pval"))
colnames(subset_pvals) <- names(subset_p_all)


subset_p_all_all$fullpval = hep_p_all[as.character(subset_p_all_all$genepair),"pval"]
subset_p_all_all$fullfdr = hep_p_all[as.character(subset_p_all_all$genepair),"fdr"]
subset_p_all_all$select = rep("Neither", nrow(subset_p_all_all))
subset_p_all_all[subset_p_all_all$fdr < FDR_level &
                   subset_p_all_all$fullfdr < FDR_level,"select"] <- "Both"
subset_p_all_all[subset_p_all_all$fdr > FDR_level &
                   subset_p_all_all$fullfdr < FDR_level,"select"] <- "Full only"
subset_p_all_all[subset_p_all_all$fdr < FDR_level &
                   subset_p_all_all$fullfdr > FDR_level,"select"] <- "Subset only"

subset_p_all_all$testing <- factor(subset_p_all_all$testing,
                                   levels = rev(gtools::mixedsort(unique(as.character(subset_p_all_all$testing)))))

subset_p_all_all$testing_iter = gsub("_", "", gsub("subset_0.[0-9]0", "", subset_p_all_all$testing))

subset_p_all_all$testingType <- factor(subset_p_all_all$testingType,
                                       levels = rev(gtools::mixedsort(unique(as.character(subset_p_all_all$testingType)))))

g = ggplot(subset_p_all_all, aes(x = -log10(fullpval), y = -log10(pval))) + 
  geom_scattermore(colour = "grey", pointsize = 1.5) +
  geom_point(aes(colour = select), data = subset(subset_p_all_all, select != "Neither"),
             size = 1) + 
  geom_abline() +
  # facet_grid(testingType ~ testing, drop = TRUE) +
  facet_grid(testingType ~ testing_iter) +
  # facet_wrap( ~ testing, ncol = 10) +
  # gsub("_", "", gsub("subset_0.[0-9]0", "", "subset_0.90_1"))
  xlab("Full data -log10(p-value)") +
  ylab("Subset -log10(p-value)") +
  scale_colour_manual(values = c("Neither" = "grey",
                                 "Both" = "black",
                                 "Full only" = "blue",
                                 "Subset only" = "red")) +
  theme_classic() +
  theme(legend.position = "right") +
  guides(colour = guide_legend("FDR adjusted\nP-value < 0.2 for:")) +
  theme(strip.text.x = element_blank()) +
  NULL

ggsave(g, file = "output/subset_scatterplot.pdf", height = 8, width = 18)

pdf("output/subset_qqplot.pdf",height = 3, width = 12.5)
par(mfrow=c(1,5))
for (i in 1:50) {
  if (i %% 10 == 1) {
    plot(sort(-log10(hep_p_all$pval)), sort(-log10(subset_pvals[,i])), type = "l",
         main = paste0(gsub("subset_0.", "", names(subset_p_all)[i]), "% subset"),
         xlab = "Full data quantiles",
         ylab = "Subset data quantiles");abline(c(0,1), col = "red")
  } else {
    points(sort(-log10(hep_p_all$pval)), sort(-log10(subset_pvals[,i])), type = "l")  
  }
}
dev.off()
## quartz_off_screen 
##                 2
# calculate inclusion frequency for each random subset type
subset_qvals <- apply(subset_pvals,2,p.adjust, method = "BH")
inclusion_frequency = t(apply(subset_qvals, 1, function(x) tapply(x, substring(names(subset_p_all),1,11), function(y)mean(y < FDR_level))))

subset_inclusion_frequency_df = data.frame(
  inclusion_frequency,
  full_pval = hep_p_all$pval,
  full_fdr = hep_p_all$fdr
)

gList = sapply(unique(substring(names(subset_p_all),1,11)), function(sub) {
  ggplot(subset_inclusion_frequency_df, aes(y = get(sub), x = -log10(full_pval))) + 
    geom_point(aes(colour = full_fdr < FDR_level), show.legend = sub == "subset_0.50", alpha = 0.5) + 
    geom_smooth(colour = "black") +
    scale_colour_manual(values = c("TRUE" = "red", "FALSE" = "grey")) +
    theme_classic() + 
    xlab(ifelse(sub == "subset_0.90", "-log10(P-value) from full dataset", "")) +
    ylab(ifelse(sub == "subset_0.90", "Proportion of selection at\nFDR adjusted P-value < 0.2 in subset", "")) + 
    ylim(c(0,1)) +
    ggtitle(paste0(gsub("subset_0.","", sub), "% subset")) +
    guides(colour = guide_legend("FDR adjusted\nP-value < 0.2")) +
    NULL
}, simplify = FALSE)

g = wrap_plots(gList, ncol = 5)
ggsave(g, file = "output/subset_inclusion_frequency.pdf", height = 4, width = 16)





subset_cor = cor(-log10(cbind(all = hep_p_all$pval, subset_pvals)), method = "spearman")
colnames(subset_cor) <- unlist(lapply(strsplit(gsub("subset_", "", colnames(subset_cor)), "_"), "[",1))
colnames(subset_cor)[!(1:ncol(subset_cor) %in% c(1, 6, 16, 26, 36, 46))] <- ""
rownames(subset_cor) <- colnames(subset_cor)



pdf("output/subset_corrplot.pdf", height = 5, width = 5)
g_cor = corrplot(subset_cor, 
                 order = "original",
                 tl.srt = 45)
dev.off()
## quartz_off_screen 
##                 2
subset_cor_with_all = data.frame(
  cor = subset_cor[1,],
  testingType = paste0(c("100", 100*as.numeric(unlist(lapply(strsplit(colnames(subset_pvals), "_"), "[", 2)))), "%"),
  testing = c("All", colnames(subset_pvals))
)
subset_cor_with_all$testing <- factor(subset_cor_with_all$testing, levels = 
                                        c("All", colnames(subset_pvals)))
subset_cor_with_all$testingType <- factor(subset_cor_with_all$testingType,
                                          levels = rev(gtools::mixedsort(as.character(unique(subset_cor_with_all$testingType)))))

g1 = ggplot(subset_cor_with_all, aes(x = testing, y = cor)) + 
  geom_col(aes(fill = testingType)) + 
  theme_classic() + 
  scale_fill_viridis_d() +
  ylab("Correlation of -log10(p-value) with full dataset") +
  xlab("Subset percentage scenario") +
  theme(axis.text.x = element_text(angle = 90)) +
  guides(fill = guide_legend("Subset\npercentage")) +
  NULL
g1

ggsave(g1,file = "output/subset_correlations.pdf",height = 6,width = 10)

Compare with GAM approach on derived weighted higher order statistic vectors

library(mgcv)  # load the package

# randomly subset
wcors_subs = replicate(50, {
  sub_ind = sort(sample(1:ncol(hep_exprs_HVG), ceiling(0.6*ncol(hep_exprs_HVG))))
  DCARS(hep_exprs_HVG[,sub_ind],
        xname = pairs_hep_all["Cdt1_Top2a",1],
        yname = pairs_hep_all["Cdt1_Top2a",2],
        W = W_hep[sub_ind,sub_ind],
        weightedConcordanceFunction = weightedSpearman,
        extractWcorSequenceOnly = TRUE)
})

gam_rank = rep(1:nrow(wcors_subs), 50)
gam_response = c(wcors_subs)
gam_fit = gam(gam_response ~ s(gam_rank))

# plot(gam_rank, gam_response)
plot(gam_fit)
points(gam_rank, gam_response, pch = 16, cex = 0.1)

summary(gam_fit)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## gam_response ~ s(gam_rank)
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.1505865  0.0006213  -242.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F p-value    
## s(gam_rank) 8.756  8.983 12470  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.901   Deviance explained = 90.2%
## GCV = 0.0047325  Scale est. = 0.0047288  n = 12250

Calculate permutation statistics for a subset of cholangiocyte genepairs

if (!file.exists("output/chol_permstats.Rdata")) {
  set.seed(500)
  chol_stats_all = apply(wcors_chol,1,sd)
  chol_globalCor_all = apply(pairs_chol_all, 1, function(x)
    cor(chol_exprs_HVG[x[1],], chol_exprs_HVG[x[2],], method = "spearman")
  )
  chol_sample = DCARS::stratifiedSample(chol_globalCor_all, length = 500)
  
  if (parallel) {
    chol_permstats_raw = mclapply(as.list(chol_sample), function(h) {
      DCARSacrossNetwork(chol_exprs_HVG,
                         edgelist = pairs_chol_all[h, , drop = FALSE],
                         W = W_chol,
                         weightedConcordanceFunction = weightedSpearman,
                         verbose = TRUE,
                         extractPermutationTestStatistics = TRUE,
                         niter = 1000)
    }, mc.cores = ncores)
    chol_permstats = lapply(chol_permstats_raw, unlist)
    names(chol_permstats) <- rownames(pairs_chol_all[chol_sample,])
  } else {
    chol_permstats_raw = DCARSacrossNetwork(chol_exprs_HVG,
                                            edgelist = pairs_chol_all[chol_sample,],
                                            W = W_chol,
                                            weightedConcordanceFunction = weightedSpearman,
                                            verbose = TRUE,
                                            extractPermutationTestStatistics = TRUE,
                                            niter = 500)
    chol_permstats = unlist(chol_permstats_raw, recursive = FALSE)
    names(chol_permstats) <- rownames(pairs_chol_all[chol_sample,])
  }
  save(chol_permstats, chol_globalCor_all, chol_stats_all, file = "output/chol_permstats.Rdata")
} else {
  load("output/chol_permstats.Rdata")
}

if (!file.exists("output/chol_p_all.Rds")) {
  chol_p_all = estimatePvaluesSpearman(chol_stats_all, 
                                       chol_globalCor_all, 
                                       chol_permstats,
                                       usenperm = FALSE,
                                       nperm = 10000,
                                       plot = TRUE,
                                       maxDist = 0.1,
                                       verbose = TRUE)
  chol_p_all$fdr <- p.adjust(chol_p_all$pval, method = "BH")
  saveRDS(chol_p_all, file = "output/chol_p_all.Rds")
} else {
  chol_p_all <- readRDS("output/chol_p_all.Rds")
}

Graph null distributions and global correlations

globalCors_hep = hep_p_all$globalCor
names(globalCors_hep) <- rownames(hep_p_all)

globalCors_chol = chol_p_all$globalCor
names(globalCors_chol) <- rownames(chol_p_all)

permstatsDF_hep = data.frame(
  branch = "hep",
  genepair = rep(names(hep_permstats), 
                 times = unlist(lapply(hep_permstats, function(x) length(unlist(x))))), 
  stat = unlist(hep_permstats))

permstatsDF_hep$globalCor = globalCors_hep[as.character(permstatsDF_hep$genepair)]

df_99_hep = data.frame(
  branch = "hep",
  globalCor = globalCors_hep[names(hep_permstats)], stat =  unlist(lapply(hep_permstats, quantile, 0.99)))
df_99_hep$fitted = loess(stat ~ globalCor, data = df_99_hep)$fitted

permstatsDF_chol = data.frame(
  branch = "chol",
  genepair = rep(names(chol_permstats), times = unlist(lapply(chol_permstats, function(x) length(unlist(x))))), 
  stat = unlist(chol_permstats))
permstatsDF_chol$globalCor = globalCors_chol[as.character(permstatsDF_chol$genepair)]

df_99_chol = data.frame(
  branch = "chol",
  globalCor = globalCors_chol[names(chol_permstats)], stat =  unlist(lapply(chol_permstats, quantile, 0.99)))
df_99_chol$fitted = loess(stat ~ globalCor, data = df_99_chol)$fitted

permstatsDF = rbind(permstatsDF_hep,
                    permstatsDF_chol)

df_99 = rbind(df_99_hep,
              df_99_chol)

g = ggplot(permstatsDF, aes(x = globalCor, y = stat, colour = branch)) + 
  geom_scattermore() +
  geom_point(aes(group = branch, colour = branch), data = df_99) + 
  geom_line(aes(y = fitted, group = branch, colour = branch), data = df_99) +
  theme_classic() + 
  ylab("Permuted test statistic") +
  xlab("Global correlation") +
  theme(panel.grid = element_blank())
g

ggsave(g, file = "output/stats_globalCor.pdf", height = 8, width = 8)

Interrogate significant genepairs for hepatocyte branch

pairs_hep_sig = pairs_hep_all[rownames(subset(hep_p_all, fdr < FDR_level)),]
wcors_hep_sig = wcors_hep[rownames(pairs_hep_sig),]

dim(pairs_hep_sig)
## [1] 224   2
length(unique(c(pairs_hep_sig)))
## [1] 136
wcors_hep_sig_smooth = t(apply(wcors_hep_sig,1,function(x){
  loess(x ~ I(1:length(x)))$fitted
}))

hc = hclust(dist(wcors_hep_sig_smooth, method = "maximum"), method = "complete") #ward.D2

wcors_hep_sig_clusterGenes = cutreeDynamic(
  hc, 
  minClusterSize = 10, 
  method = "tree",
  deepSplit = TRUE,
  useMedoids = TRUE
)
k_hep = length(unique(wcors_hep_sig_clusterGenes))
k_hep
## [1] 10
wcors_hep_sig_clusterGenes = cutree(hc, k = k_hep-1)
names(wcors_hep_sig_clusterGenes) <- hc$labels

saveRDS(wcors_hep_sig_clusterGenes, file = "output/wcors_hep_sig_clusterGenes.Rds")

hep_sig_table = cbind(hep_p_all[rownames(pairs_hep_sig),], cluster = wcors_hep_sig_clusterGenes)

wcors_hep_sig_clusterGenes_list = lapply(split(names(wcors_hep_sig_clusterGenes), wcors_hep_sig_clusterGenes), function(h) sort(unique(c(pairs_hep_all[h,]))))
length(wcors_hep_sig_clusterGenes_list)
## [1] 9
wcors_hep_sig_mean = apply(wcors_hep_sig, 2, function(x){
  tapply(x,wcors_hep_sig_clusterGenes, mean)
})
sigOrder = rev(paste0("Cluster ",rownames(wcors_hep_sig_mean)[heatmap.2(wcors_hep_sig_mean)$rowInd]))

pdf("output/clustered_wcor_hep.pdf", height = 10, width = 8)
heatmap.2(wcors_hep_sig_mean, trace = "n", Colv = FALSE, 
          col = colorRampPalette(c("blue","white","red"))(100),
          margins = c(8,8),
          main = "Hepatocyte")
dev.off()
## quartz_off_screen 
##                 2
wcors_hep_sigLong = reshape::melt(t(wcors_hep_sig))
wcors_hep_sigLong$cluster = paste0("Cluster ", wcors_hep_sig_clusterGenes[as.character(wcors_hep_sigLong$X2)])
wcors_hep_sigLong$cluster <- factor(wcors_hep_sigLong$cluster, 
                                    levels = sigOrder
)
gh = geom_hline(yintercept = 0, linetype = "dotted", colour = "red", size = 1)
g = ggplot(
  wcors_hep_sigLong,
  aes(x = X1, y = value, group = X2)) +
  theme_minimal() + 
  theme(panel.grid = element_blank()) +
  ylim(c(-1,1)) +
  theme(axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5)) +
  xlab("Pseudotime") +
  ylab("Local weighted correlation") +
  NULL
numSummary_df = data.frame(cluster = sort(unique(wcors_hep_sigLong$cluster)),
                           num = tapply(wcors_hep_sigLong$X2, wcors_hep_sigLong$cluster, length)/max(wcors_hep_sigLong$X1))
g1 = g + 
  geom_line() + 
  theme(axis.text.x = element_blank()) +
  theme(axis.line.y = element_line()) +
  facet_wrap(~cluster, ncol = length(unique(wcors_hep_sigLong$cluster)), scales = "free") +
  theme(strip.text = element_text(size = 12)) +
  geom_text(data = numSummary_df, 
            aes(label = paste0(num, " gene pairs")), 
            x = 200, y = -0.8, inherit.aes = FALSE) + 
  xlab("Pseudotime Ranking") +
  theme(axis.title = element_text(size = 15)) +
  theme(axis.line.y = element_line()) +
  scale_y_continuous(breaks = c(-1,0,1), labels = c(-1,0,1), limits = c(-1, 1)) +
  geom_hline(yintercept = 0, colour = "black", linetype = "solid", size = 0.5) +
  geom_vline(xintercept = n_first, colour = "darkblue", linetype = "dashed", size = 0.5) +
  NULL
g1

ggsave(g1, file = "output/hep_wcors_all_clustered.pdf", height = 3.5, width = 1.6*length(unique(wcors_hep_sigLong$cluster)))

wcors_hep_sig_clusterGenes_membership = do.call(cbind, lapply(wcors_hep_sig_clusterGenes_list, function(x){
  v = 1*(sort(unique(unlist(wcors_hep_sig_clusterGenes_list))) %in% x)
  names(v) <- sort(unique(unlist(wcors_hep_sig_clusterGenes_list)))
  return(v)
}))

jacDist = (t(wcors_hep_sig_clusterGenes_membership) %*% wcors_hep_sig_clusterGenes_membership) / (nrow(wcors_hep_sig_clusterGenes_membership) - ((1 - t(wcors_hep_sig_clusterGenes_membership)) %*% (1 - wcors_hep_sig_clusterGenes_membership)))

heatmap.2(jacDist, trace = "n", main = "Jaccard distance of genes within clusters", 
          density.info = "none",
          key.title = "",
          key.xlab = "Jaccard distance",
          symm = TRUE, 
          revC = TRUE,
          col = colorRampPalette(c("black","yellow")))

Print this out for significantly differentially correlated gene pairs, along with cluster ID.

hep_p_all$sigCluster = wcors_hep_sig_clusterGenes[rownames(hep_p_all)]
hep_p_all$startCor = wcors_hep[rownames(hep_p_all),1]
hep_p_all$branchpointCor = wcors_hep[rownames(hep_p_all),n_first]
hep_p_all$endCor = wcors_hep[rownames(hep_p_all),ncol(wcors_hep)]

hep_p_all_sorted = reshape::sort_df(hep_p_all, "fdr")
write.table(hep_p_all_sorted, file = "output/hep_p_all_sorted.tsv", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote = FALSE)
write.table(subset(hep_p_all_sorted, fdr < FDR_level), file = "output/hep_p_all_sorted_sig.tsv", sep = "\t",
            row.names = FALSE, col.names = TRUE, quote = FALSE)

GO testing for hepatocyte branch

GO_list_hep = GO_list[unlist(lapply(GO_list, function(x) any(x %in% nonDE_HVG_hep)))]

if (!file.exists("output/hep_superclusterGenes_GO.Rds")) {
  hep_superclusterGenes_GO = lapply(wcors_hep_sig_clusterGenes_list, function(set) {
    print("testing..")
    genesetGOtest(set, rownames(liver), GO_list_hep)
  }
  )
  saveRDS(hep_superclusterGenes_GO, file = "output/hep_superclusterGenes_GO.Rds")
} else {
  hep_superclusterGenes_GO = readRDS("output/hep_superclusterGenes_GO.Rds")
}

gList = lapply(hep_superclusterGenes_GO, function(pval) {
  df = data.frame(term = factor(names(pval), levels = c(names(pval), "")),
                  pval = pval,
                  qval = p.adjust(pval, method = "BH"))
  df$label = df$term
  df$label[pval != 1] <- ""
  df_sorted = reshape::sort_df(df, "pval")[1:10,]
  df_sorted$term = factor(df_sorted$term, levels =  rev(df_sorted$term))
  
  g = ggplot(df_sorted, aes(x = term, y = -log10(pval), fill = qval < 0.05)) +
    theme_classic() +
    geom_col() +
    coord_flip() +
    xlab("") +
    ylab(expression("-log10(P-value)")) +
    # geom_hline(yintercept = -log10(0.01), colour = "red", linetype = "dashed", size = 1.2) +
    scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
    theme(legend.position = "none") +
    NULL
  return(g)
})

gList_named = sapply(seq_len(length(gList)), function(i){
  g = gList[[i]] +
    scale_fill_manual(values = c("TRUE" = "dimgrey", "FALSE" = "peachpuff")) +
    ggtitle(paste0("Cluster ",names(gList)[i])) +
    theme(axis.text.y = element_text(size = 10)) +
    theme(title = element_text(hjust = 0.5)) +
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) +
    NULL
  ggsave(g, file = paste0("output/hep_superclusters_GO_", i, ".pdf"),
         height = 4, width = 7)
  return(g)
}, simplify = FALSE)
g_named = patchwork::wrap_plots(gList_named, nrow = 3)
ggsave(g_named, file = "output/hep_superclusters_GO.pdf",height = 10, width = 16)

Interrogate significant genepairs for cholangiocyte branch

pairs_chol_sig = pairs_chol_all[rownames(subset(chol_p_all, fdr < FDR_level)),]
dim(pairs_chol_sig)
## [1] 78  2
wcors_chol_sig = wcors_chol[rownames(pairs_chol_sig),]

wcors_chol_sig_smooth = t(apply(wcors_chol_sig,1,function(x){
  loess(x ~ I(1:length(x)))$fitted
}))

hc = hclust(dist(wcors_chol_sig_smooth, method = "maximum"), method = "complete")

wcors_chol_sig_clusterGenes = cutreeDynamic(
  hc, 
  minClusterSize = 5, 
  method = "tree",
  deepSplit = TRUE,
  useMedoids = TRUE
)

length(unique(wcors_chol_sig_clusterGenes))
## [1] 11
wcors_chol_sig_clusterGenes = cutree(hc, k = length(unique(wcors_chol_sig_clusterGenes))-1)
names(wcors_chol_sig_clusterGenes) <- hc$labels

saveRDS(wcors_chol_sig_clusterGenes, file = "output/wcors_chol_sig_clusterGenes.Rds")

length(unique(wcors_chol_sig_clusterGenes))
## [1] 10
wcors_chol_sig_clusterGenes_list = lapply(split(names(wcors_chol_sig_clusterGenes), wcors_chol_sig_clusterGenes), function(h) sort(unique(c(pairs_chol_all[h,]))))
wcors_chol_sig_mean = apply(wcors_chol_sig, 2, function(x){
  tapply(x,wcors_chol_sig_clusterGenes, mean)
})
heatmap.2(wcors_chol_sig, trace = "n", Colv = FALSE, 
          col = colorRampPalette(c("blue","white","red"))(100),
          margins = c(8,8), 
          RowSideColors = tol12qualitative[wcors_chol_sig_clusterGenes])

matplot(t(wcors_chol_sig_mean), type = "l", lwd = 3)

wcors_chol_sig_clusterGenes_membership = do.call(cbind, lapply(wcors_chol_sig_clusterGenes_list, function(x){
  v = 1*(sort(unique(unlist(wcors_chol_sig_clusterGenes_list))) %in% x)
  names(v) <- sort(unique(unlist(wcors_chol_sig_clusterGenes_list)))
  return(v)
}))

jacDist = (t(wcors_chol_sig_clusterGenes_membership) %*% wcors_chol_sig_clusterGenes_membership) / (nrow(wcors_chol_sig_clusterGenes_membership) - ((1 - t(wcors_chol_sig_clusterGenes_membership)) %*% (1 - wcors_chol_sig_clusterGenes_membership)))

heatmap.2(jacDist, trace = "n", main = "Jaccard distance of genes within clusters", 
          density.info = "none",
          key.title = "",
          key.xlab = "Jaccard distance",
          symm = TRUE, 
          revC = TRUE,
          col = colorRampPalette(c("black","yellow")))

sigOrder = rev(paste0("Cluster ",rownames(wcors_chol_sig_mean)[heatmap.2(wcors_chol_sig_mean)$rowInd]))

wcors_chol_sigLong = reshape::melt(t(wcors_chol_sig))
wcors_chol_sigLong$cluster = paste0("Cluster ", wcors_chol_sig_clusterGenes[as.character(wcors_chol_sigLong$X2)])
wcors_chol_sigLong$cluster <- factor(wcors_chol_sigLong$cluster, 
                                     levels = sigOrder
)
gh = geom_hline(yintercept = 0, linetype = "dotted", colour = "red", size = 1)
g = ggplot(
  wcors_chol_sigLong,
  aes(x = X1, y = value, group = X2)) +
  theme_minimal() + 
  theme(panel.grid = element_blank()) +
  ylim(c(-1,1)) +
  theme(axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5)) +
  xlab("Pseudotime") +
  ylab("Local weighted correlation") +
  NULL
numSummary_df = data.frame(cluster = sort(unique(wcors_chol_sigLong$cluster)),
                           num = tapply(wcors_chol_sigLong$X2, wcors_chol_sigLong$cluster, length)/max(wcors_chol_sigLong$X1))
g1 = g + 
  geom_line() + 
  theme(axis.text.x = element_blank()) +
  theme(axis.line.y = element_line()) +
  facet_wrap(~cluster, ncol = 10, scales = "free") +
  theme(strip.text = element_text(size = 12)) +
  geom_text(data = numSummary_df, 
            aes(label = paste0(num, " gene pairs")), 
            x = 200, y = -0.8, inherit.aes = FALSE) + 
  xlab("Pseudotime Ranking") +
  theme(axis.title = element_text(size = 15)) +
  theme(axis.line.y = element_line()) +
  scale_y_continuous(breaks = c(-1,0,1), labels = c(-1,0,1), limits = c(-1, 1)) +
  geom_hline(yintercept = 0, colour = "black", linetype = "solid", size = 0.5) +
  geom_vline(xintercept = n_first, colour = "darkblue", linetype = "dashed", size = 0.5) +
  NULL
g1

ggsave(g1, file = "output/chol_wcors_all_clustered.pdf", height = 3.5, width = 1.6*length(unique(wcors_chol_sigLong$cluster)))

GO testing for cholangiocyte branch

GO_list_chol = GO_list[unlist(lapply(GO_list, function(x) any(x %in% nonDE_HVG_chol)))]

if (!file.exists("output/chol_superclusterGenes_GO.Rds")) {
  chol_superclusterGenes_GO = lapply(wcors_chol_sig_clusterGenes_list, function(set) {
    print("testing..")
    genesetGOtest(set, rownames(liver), GO_list_chol)
  }
  )
  saveRDS(chol_superclusterGenes_GO, file = "output/chol_superclusterGenes_GO.Rds")
} else {
  chol_superclusterGenes_GO = readRDS("output/chol_superclusterGenes_GO.Rds")
}

gList = lapply(chol_superclusterGenes_GO, function(pval) {
  df = data.frame(term = factor(names(pval), levels = c(names(pval), "")),
                  pval = pval,
                  qval = p.adjust(pval, method = "BH"))
  df$label = df$term
  df$label[pval != 1] <- ""
  df_sorted = reshape::sort_df(df, "pval")[1:10,]
  df_sorted$term = factor(df_sorted$term, levels =  rev(df_sorted$term))
  
  g = ggplot(df_sorted, aes(x = term, y = -log10(pval), fill = qval < 0.05)) +
    theme_classic() +
    geom_col() +
    coord_flip() +
    xlab("") +
    ylab(expression("-log10(P-value)")) +
    # geom_hline(yintercept = -log10(0.01), colour = "red", linetype = "dashed", size = 1.2) +
    scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
    theme(legend.position = "none") +
    NULL
  return(g)
})

gList_named = sapply(seq_len(length(gList)), function(i){
  g = gList[[i]] +
    scale_fill_manual(values = c("TRUE" = "dimgrey", "FALSE" = "peachpuff")) +
    ggtitle(paste0("Cluster ",names(gList)[i])) +
    theme(axis.text.y = element_text(size = 10)) +
    theme(title = element_text(hjust = 0.5)) +
    scale_x_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) +
    NULL
  ggsave(g, file = paste0("output/chol_superclusters_GO_", i, ".pdf"),
         height = 4, width = 7)
  return(g)
}, simplify = FALSE)
g_named = patchwork::wrap_plots(gList_named, nrow = 3)
ggsave(g_named, file = "output/chol_superclusters_GO.pdf",height = 10, width = 16)

Is observed differential correlation independent of observed differential variability?

dv_genes_hep = names(which(hep_DV_fdr < 0.1))
dc_genes_hep = sort(unique(c(pairs_hep_sig)))

hep_genepair_type = data.frame(
  genepair = hep_p_all$genepair,
  gene1 = pairs_hep_all[,1],
  gene2 = pairs_hep_all[,2],
  sig = ifelse(hep_p_all$fdr < FDR_level, "DC", "not DC")
)
hep_genepair_type$gene1DV = hep_genepair_type$gene1 %in% dv_genes_hep
hep_genepair_type$gene2DV = hep_genepair_type$gene2 %in% dv_genes_hep
hep_genepair_type$type_sum = hep_genepair_type$gene1DV + hep_genepair_type$gene2DV
hep_genepair_type$type = "not DV - not DV"
hep_genepair_type$type[hep_genepair_type$type_sum == 1] <- "not DV - DV"
hep_genepair_type$type[hep_genepair_type$type_sum == 2] <- "DV - DV"

addmargins(table(hep_genepair_type$type, hep_genepair_type$sig))
##                  
##                      DC not DC   Sum
##   DV - DV           145  13221 13366
##   not DV - DV        70   7638  7708
##   not DV - not DV     9   1072  1081
##   Sum               224  21931 22155
fisher.test(table(hep_genepair_type$type, hep_genepair_type$sig))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(hep_genepair_type$type, hep_genepair_type$sig)
## p-value = 0.423
## alternative hypothesis: two.sided
chisq.test(table(hep_genepair_type$type, hep_genepair_type$sig))$residuals
##                  
##                            DC      not DC
##   DV - DV          0.84834585 -0.08573689
##   not DV - DV     -0.89855533  0.09081124
##   not DV - not DV -0.58365099  0.05898587
# Now for cholangiocyte branch

dv_genes_chol = names(which(chol_DV_fdr < 0.1))
dc_genes_chol = sort(unique(c(pairs_chol_sig)))

chol_genepair_type = data.frame(
  genepair = chol_p_all$genepair,
  gene1 = pairs_chol_all[,1],
  gene2 = pairs_chol_all[,2],
  sig = ifelse(chol_p_all$fdr < FDR_level, "DC", "not DC")
)
chol_genepair_type$gene1DV = chol_genepair_type$gene1 %in% dv_genes_chol
chol_genepair_type$gene2DV = chol_genepair_type$gene2 %in% dv_genes_chol
chol_genepair_type$type_sum = chol_genepair_type$gene1DV + chol_genepair_type$gene2DV
chol_genepair_type$type = "not DV - not DV"
chol_genepair_type$type[chol_genepair_type$type_sum == 1] <- "not DV - DV"
chol_genepair_type$type[chol_genepair_type$type_sum == 2] <- "DV - DV"

addmargins(table(chol_genepair_type$type, chol_genepair_type$sig))
##                  
##                      DC not DC   Sum
##   DV - DV            49   7952  8001
##   not DV - DV        28   3782  3810
##   not DV - not DV     1    434   435
##   Sum                78  12168 12246
fisher.test(table(chol_genepair_type$type, chol_genepair_type$sig))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(chol_genepair_type$type, chol_genepair_type$sig)
## p-value = 0.466
## alternative hypothesis: two.sided
chisq.test(table(chol_genepair_type$type, chol_genepair_type$sig))$residuals
##                  
##                            DC      not DC
##   DV - DV         -0.27480761  0.02200222
##   not DV - DV      0.75767909 -0.06066288
##   not DV - not DV -1.06377638  0.08517027

Branch comparison between hepatocyte and cholangiocyte branches

both_genes = sort(union(c(pairs_hep_all), c(pairs_chol_all)))

getStrength = function(p_all, pairs, genes) {
  
  sumStats = sum(-log10(p_all$fdr))
  
  strength = sapply(genes, function(gene) {
    sub_p = p_all[pairs[,1] == gene | pairs[,2] == gene,]
    sum((-log10(subset(sub_p, fdr < FDR_level)$fdr)/sumStats))
  })
  strength[is.na(strength)] <- 0
  
  numZero = sum(strength == 0)
  
  return(strength)
}

strength_hep = getStrength(hep_p_all, pairs_hep_all, both_genes)
strength_chol = getStrength(chol_p_all, pairs_chol_all, both_genes)

strength_A = strength_hep + strength_chol
strength_M = strength_chol - strength_hep

# random cloud of points 
if (!file.exists("output/sigCounts.Rds")) {
  set.seed(500)
  sigCounts = replicate(10000, {
    
    hep_p_all_fake = hep_p_all
    chol_p_all_fake = chol_p_all
    
    fake_index_hep = sample(nrow(hep_p_all))
    fake_index_chol = sample(nrow(chol_p_all))
    
    hep_p_all_fake$stat = hep_p_all$stat[fake_index_hep]
    hep_p_all_fake$fdr = hep_p_all$fdr[fake_index_hep]
    chol_p_all_fake$stat = chol_p_all$stat[fake_index_chol]
    chol_p_all_fake$fdr = chol_p_all$fdr[fake_index_chol]
    
    strength_hep_fake = getStrength(hep_p_all_fake, pairs_hep_all, both_genes)
    strength_chol_fake = getStrength(chol_p_all_fake, pairs_chol_all, both_genes)
    
    strength_A_fake = strength_hep_fake + strength_chol_fake
    strength_M_fake = strength_chol_fake - strength_hep_fake
    
    cos_angle = strength_M_fake / strength_A_fake
    
    sigCount = 1*(sqrt(strength_A_fake^2 + strength_M_fake^2) >= sqrt(strength_A^2 + strength_M^2))
    return(sigCount)
  })
  saveRDS(sigCounts, file = "output/sigCounts.Rds")
} else {
  sigCounts = readRDS("output/sigCounts.Rds")
}

if (!file.exists("output/random_cos_angle.Rds")) {
  set.seed(500)
  random_cos_angle = replicate(10000, {
    
    hep_p_all_fake = hep_p_all
    chol_p_all_fake = chol_p_all
    
    fake_index_hep = sample(nrow(hep_p_all))
    fake_index_chol = sample(nrow(chol_p_all))
    
    hep_p_all_fake$stat = hep_p_all$stat[fake_index_hep]
    hep_p_all_fake$fdr = hep_p_all$fdr[fake_index_hep]
    chol_p_all_fake$stat = chol_p_all$stat[fake_index_chol]
    chol_p_all_fake$fdr = chol_p_all$fdr[fake_index_chol]
    
    strength_hep_fake = getStrength(hep_p_all_fake, pairs_hep_all, both_genes)
    strength_chol_fake = getStrength(chol_p_all_fake, pairs_chol_all, both_genes)
    
    strength_A_fake = strength_hep_fake + strength_chol_fake
    strength_M_fake = strength_chol_fake - strength_hep_fake
    
    cos_angle = strength_M_fake / strength_A_fake
    return(cos_angle)
  })
  saveRDS(random_cos_angle, file = "output/random_cos_angle.Rds")
} else {
  random_cos_angle = readRDS("output/random_cos_angle.Rds")
}

MA_pval = rowMeans(sigCounts)

cos_angle = strength_M / strength_A
dim(random_cos_angle)
## [1]   257 10000
angle_pval = sapply(names(cos_angle), function(gene) {
  mean(abs(random_cos_angle[gene,]) > abs(cos_angle[gene]), na.rm = TRUE)
})
# small p-value means branch-specific

table(angle_pval > 0.05 & angle_pval < 0.95, p.adjust(MA_pval, method = "BH") < 0.05)
##        
##         FALSE TRUE
##   FALSE   132   14
##   TRUE     23    5
names(which(angle_pval > 0.05 & angle_pval < 0.95 & p.adjust(MA_pval, method = "BH") < 0.05))
## [1] "Bex1"     "Cdt1"     "Fth1"     "Gstm1"    "Ivns1abp"
diffGenes = names(which(angle_pval > 0.05 & angle_pval < 0.95 & p.adjust(MA_pval, method = "BH") < 0.05))

strength_df = data.frame(
  gene = both_genes,
  strength_A = strength_A,
  strength_M = strength_M,
  MA_pval = MA_pval,
  MA_fdr = p.adjust(MA_pval, method = "BH"),
  angle_pval = angle_pval,
  angle_fdr = p.adjust(angle_pval, method = "BH")
)

strength_df$col = "none"
strength_df$col[strength_df$MA_fdr < 0.05 & strength_df$angle_fdr > 0.05] <- "red"
strength_df$col[strength_df$MA_fdr < 0.05 & strength_df$angle_fdr < 0.05] <- "black"

g = ggplot(strength_df, aes(x = strength_A, y = -strength_M)) + 
  # geom_point(colour = "grey") +
  geom_point(aes(colour = col)) +
  theme_minimal() + 
  geom_text_repel(aes(label = gene, colour = col),
                  data = subset(strength_df, col != "none"),
                  fontface = "italic",
                  size = 5) +
  geom_hline(yintercept = 0) +
  theme(panel.grid = element_blank()) +
  theme(axis.title = element_text(size = 13)) +
  xlab("Gene strength sum") +
  ylab("Gene strength difference (Hepatocyte - Cholangiocyte)") +
  geom_segment(aes(x = x, y = y, xend = xend, yend = yend), arrow = arrow(), 
               data = data.frame(x = c(0,0), xend = c(0,0),
                                 y = c(0,0), yend = c(0.01,-0.01))) +
  geom_text(aes(x = x, y = y, label = label),
            data = data.frame(
              x = c(0.0017,0.002),
              y = c(0.01, -0.01),
              label = c("Hepatocyte\nbranch","Cholangiocyte\nbranch")
            )) +
  # theme(axis.text = element_blank()) +
  theme(legend.position = "none") +
  scale_colour_manual(values = c("black" = "black", "red" = "red", "none" = "grey"),
                      labels = c("black" = "Branch specific",
                                 "red" = "Common to either branch")) +
  labs(colour = "") +
  geom_text(data = data.frame(x = rep(0.008,2), y = -0.011 + c(0,2e-3),
                              label = c("Branch specific","Common to \neither branch"), col = c("black","red")),
            inherit.aes = FALSE,
            aes(x = x, y = y, label = label, colour = col),
            hjust = 0.5,
            fontface = "bold") +
  NULL
g

ggsave(g, file = "output/hep_chol_branch_comparison.pdf", height = 5, width = 5)

Misc: Cdt1 and Top2a graph

scales::show_col(tableau_color_pal("Tableau 10")(3))

branchcols = tableau_color_pal("Tableau 10")(3)
names(branchcols) <- c("Hepatocyte","Cholangiocyte","Hepatoblast")

df = data.frame(gene1 = liver_branch_hep["Cdt1",],
                gene2 = liver_branch_hep["Top2a",],
                rank = rank(liver_pseudotime_hep))
W_hep_weight = W_hep
rownames(W_hep_weight) <- paste0("Weight_", 1:nrow(W_hep_weight))
df = cbind(df, t(W_hep_weight))

df$pseudotime <- liver_pseudotime_hep
df$mean_gene1 <- apply(W_hep, 1, function(w) weightedMean(liver_branch_hep["Cdt1",], w))
df$mean_gene2 <- apply(W_hep, 1, function(w) weightedMean(liver_branch_hep["Top2a",], w))
df$var_gene1 <- hep_DV_wVars["Cdt1",]
df$var_gene2 <- hep_DV_wVars["Top2a",]
df$sd_gene1 <- sqrt(df$var_gene1)
df$sd_gene2 <- sqrt(df$var_gene2)
df$lower_gene1 = df$mean_gene1 - df$sd_gene1
df$upper_gene1 = df$mean_gene1 + df$sd_gene1
df$lower_gene2 = df$mean_gene2 - df$sd_gene2
df$upper_gene2 = df$mean_gene2 + df$sd_gene2

g1 = ggplot(df, aes(x = rank, y = gene1, colour = rank)) + 
  geom_point() +
  geom_ribbon(aes(ymin = lower_gene1, ymax = upper_gene1), alpha = 0.1, colour = NA) +
  geom_line(aes(y = mean_gene1), size = 2) + 
  theme_minimal() +
  theme(legend.position = "none") +
  theme(panel.grid = element_blank()) +
  theme(axis.title = element_text(size = 15)) +
  xlab("") +
  ylab(expression(paste(italic("Cdt1"), " Expression"))) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5, size = 20)) +
  scale_colour_gradient(low = branchcols["Hepatoblast"],
                        high = branchcols["Hepatocyte"]) +
  theme(plot.margin = unit(c(0,0,0,0), "in")) +
  theme(axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5)) +
  NULL
g1

g2 = ggplot(df, aes(x = rank, y = gene2, colour = rank)) + 
  geom_point() +
  geom_ribbon(aes(ymin = lower_gene2, ymax = upper_gene2), alpha = 0.1, colour = NA) +
  geom_line(aes(y = mean_gene2), size = 2) + 
  theme_minimal() +
  theme(legend.position = "none") +
  theme(panel.grid = element_blank()) +
  theme(axis.title = element_text(size = 15)) +
  xlab("Pseudotime Ranking") +
  ylab(expression(paste(italic("Top2a"), " Expression"))) +
  ggtitle("") +
  theme(plot.title = element_text(hjust = 0.5, size = 5)) +
  scale_colour_gradient(low = branchcols["Hepatoblast"],
                        high = branchcols["Hepatocyte"]) +
  theme(plot.margin = unit(c(0,0,0,0.2), "in")) +
  theme(axis.line.x = element_line(color="black", size = 0.5),
        axis.line.y = element_line(color="black", size = 0.5)) +
  NULL
g2

g = g1 + g2 + plot_layout(ncol = 2)
ggsave(g, file = "output/Cdt1_Top2a_ribbonPlot.pdf",height = 2.3, width = 12)

gList = sapply(paste0("Weight_", round(seq(1,nrow(W_hep_weight), length.out = 5))), function(weight) {
  ggplot(df, aes(x = gene1, y = gene2)) +
    geom_point(aes(alpha = get(weight),
                   size = get(weight),
                   colour = rank),
               pch = 16) + # , size = get(weight)
    theme_classic() +
    theme(panel.grid = element_blank()) +
    theme(axis.line.x = element_line(color="black", size = 1),
          axis.line.y = element_line(color="black", size = 1)) +
    theme(legend.position = "none") +
    geom_density2d(data = subset(df, get(weight) > 0), colour = "black",
                   size = 0.5, linetype = "solid",
                   bins = 5) +
    xlab("") +
    ylab("") +
    xlim(c(0,10.2)) +
    scale_colour_gradient(low = branchcols["Hepatoblast"],
                          high = branchcols["Hepatocyte"]) +
    scale_size_continuous(range = c(1, 4)) +
    scale_alpha_continuous(range = c(0.5, 1)) +
    theme(axis.text = element_text(size = 15)) +
    theme(axis.title = element_text(size = 20)) +
    xlab(ifelse(weight == "Weight_1", expression(italic("Cdt1")), "")) +
    ylab(ifelse(weight == "Weight_1", expression(italic("Top2a")), "")) +
    ggtitle(
      paste0("Local cor: ", 
             round(weightedSpearman(df$gene1, df$gene2, w = df[,weight]), 2))
    ) +
    theme(plot.title = element_text(size = 20)) +
    NULL
}, simplify = FALSE)

g = wrap_plots(gList, nrow = 1)
g

sapply(names(gList), function(n){
  ggsave(gList[[n]], file = paste0("output/Top2a_Cdt1_density_scatterplots_", n,".pdf"),height = 3.6, width = 3.6)
})  
## $Weight_1
## NULL
## 
## $Weight_103
## NULL
## 
## $Weight_204
## NULL
## 
## $Weight_306
## NULL
## 
## $Weight_408
## NULL
ggsave(g, file = "output/Top2a_Cdt1_density_scatterplots.pdf",height = 3.5, width = 14)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin18.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.6_1/lib/libopenblasp-r0.3.6.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
##  [1] splines   grid      stats4    parallel  stats     graphics  grDevices
##  [8] utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-31                 nlme_3.1-144               
##  [3] energy_1.7-7                minerva_1.5.8              
##  [5] UpSetR_1.4.0                GGally_1.4.0               
##  [7] circlize_0.4.8              corrplot_0.84              
##  [9] scMerge_1.2.0               M3Drop_1.12.0              
## [11] numDeriv_2016.8-1.1         ggridges_0.5.2             
## [13] ggrepel_0.8.1               plotly_4.9.2               
## [15] gtools_3.8.1                igraph_1.2.4.2             
## [17] ggthemes_4.2.0              monocle_2.14.0             
## [19] DDRTree_0.1.5               irlba_2.3.3                
## [21] VGAM_1.1-2                  Matrix_1.2-18              
## [23] cowplot_1.0.0               ggpubr_0.2.5               
## [25] magrittr_1.5                patchwork_1.0.0            
## [27] ggforce_0.3.1               stringr_1.4.0              
## [29] org.Mm.eg.db_3.10.0         GO.db_3.10.0               
## [31] AnnotationDbi_1.48.0        ComplexHeatmap_2.2.0       
## [33] dynamicTreeCut_1.63-1       scattermore_0.4            
## [35] reshape_0.8.8               scran_1.14.6               
## [37] scater_1.14.6               SingleCellExperiment_1.8.0 
## [39] SummarizedExperiment_1.16.1 DelayedArray_0.12.2        
## [41] BiocParallel_1.20.1         matrixStats_0.55.0         
## [43] Biobase_2.46.0              GenomicRanges_1.38.0       
## [45] GenomeInfoDb_1.22.0         IRanges_2.20.2             
## [47] S4Vectors_0.24.3            BiocGenerics_0.32.0        
## [49] gplots_3.0.1.2              ggplot2_3.2.1              
## [51] DCARS_0.3.5                
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.0.0         RSQLite_2.2.0            htmlwidgets_1.5.1       
##   [4] combinat_0.0-8           docopt_0.6.1             Rtsne_0.15              
##   [7] munsell_0.5.0            codetools_0.2-16         statmod_1.4.34          
##  [10] withr_2.1.2              colorspace_1.4-1         fastICA_1.2-2           
##  [13] knitr_1.28               rstudioapi_0.11          ggsignif_0.6.0          
##  [16] labeling_0.3             slam_0.1-47              bbmle_1.0.23.1          
##  [19] GenomeInfoDbData_1.2.2   polyclip_1.10-0          bit64_0.9-7             
##  [22] farver_2.0.3             pheatmap_1.0.12          vctrs_0.2.2             
##  [25] xfun_0.12                R6_2.4.1                 ggbeeswarm_0.6.0        
##  [28] clue_0.3-57              rsvd_1.0.3               locfit_1.5-9.1          
##  [31] bitops_1.0-6             assertthat_0.2.1         scales_1.1.0            
##  [34] nnet_7.3-12              startupmsg_0.9.6         beeswarm_0.2.3          
##  [37] gtable_0.3.0             rlang_0.4.4              GlobalOptions_0.1.1     
##  [40] lazyeval_0.2.2           acepack_1.4.1            checkmate_2.0.0         
##  [43] yaml_2.2.1               reshape2_1.4.3           backports_1.1.5         
##  [46] Hmisc_4.3-1              tools_3.6.1              RColorBrewer_1.1-2      
##  [49] proxy_0.4-23             Rcpp_1.0.3               plyr_1.8.5              
##  [52] base64enc_0.1-3          zlibbioc_1.32.0          purrr_0.3.3             
##  [55] RCurl_1.98-1.1           densityClust_0.3         rpart_4.1-15            
##  [58] reldist_1.6-6            GetoptLong_0.1.8         viridis_0.5.1           
##  [61] sfsmisc_1.1-5            cluster_2.1.0            data.table_1.12.8       
##  [64] RANN_2.6.1               mvtnorm_1.0-12           distr_2.8.0             
##  [67] evaluate_0.14            jpeg_0.1-8.1             sparsesvd_0.2           
##  [70] gridExtra_2.3            shape_1.4.4              HSMMSingleCell_1.6.0    
##  [73] compiler_3.6.1           bdsmatrix_1.3-4          tibble_2.1.3            
##  [76] KernSmooth_2.23-16       crayon_1.3.4             htmltools_0.4.0         
##  [79] Formula_1.2-3            tidyr_1.0.2              DBI_1.1.0               
##  [82] tweenr_1.0.1             MASS_7.3-51.5            boot_1.3-24             
##  [85] gdata_2.18.0             pkgconfig_2.0.3          foreign_0.8-75          
##  [88] vipor_0.4.5              dqrng_0.2.1              XVector_0.26.0          
##  [91] ruv_0.9.7.1              digest_0.6.24            rmarkdown_2.1           
##  [94] htmlTable_1.13.3         edgeR_3.28.0             DelayedMatrixStats_1.8.0
##  [97] rjson_0.2.20             lifecycle_0.1.0          jsonlite_1.6.1          
## [100] BiocNeighbors_1.4.1      viridisLite_0.3.0        limma_3.42.2            
## [103] pillar_1.4.3             lattice_0.20-40          httr_1.4.1              
## [106] survival_3.1-8           glue_1.3.1               qlcMatrix_0.9.7         
## [109] FNN_1.1.3                png_0.1-7                bit_1.1-15.2            
## [112] stringi_1.4.6            blob_1.2.1               BiocSingular_1.2.2      
## [115] latticeExtra_0.6-29      caTools_1.18.0           memoise_1.1.0           
## [118] dplyr_0.8.4